diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250127-004222/args.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250127-004222/args.json new file mode 100644 index 0000000000000000000000000000000000000000..01d640ec7d302fc70ab6a0a505562a4a07ddb606 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250127-004222/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 
42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250127-004222", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250127-004222/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": 
false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + 
"resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + 
"boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + 
"cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-7b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-7b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": 
"/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250127-004222', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250127-004222/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250127-004222', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', 
greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', 
evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250127-004222/logging.jsonl b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250127-004222/logging.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ea61caa3fbb10d945a51604958c3f2f53e25530f --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250127-004222/logging.jsonl @@ -0,0 +1,4 @@ +{"loss": 2.46972656, "grad_norm": 9.15848408, "learning_rate": 7.69e-06, 
"memory(GiB)": 8.4, "train_speed(iter/s)": 0.121692, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/rejected": -142.0, "logps/chosen": -520.0, "logits/rejected": -2.546875, "logits/chosen": 2.390625, "nll_loss": 0.51171875, "epoch": 0.02, "global_step/max_steps": "1/250", "percentage": "0.40%", "elapsed_time": "5s", "remaining_time": "21m 50s"} +{"loss": 2.40649414, "grad_norm": 17.66065744, "learning_rate": 3.846e-05, "memory(GiB)": 12.04, "train_speed(iter/s)": 0.270987, "rewards/chosen": 0.125, "rewards/rejected": 0.13769531, "rewards/accuracies": 0.25, "rewards/margins": -0.01251221, "logps/rejected": -304.0, "logps/chosen": -728.0, "logits/rejected": -0.8671875, "logits/chosen": 0.14746094, "nll_loss": 1.1484375, "epoch": 0.1, "global_step/max_steps": "5/250", "percentage": "2.00%", "elapsed_time": "15s", "remaining_time": "12m 38s"} +{"loss": 2.21494141, "grad_norm": 8.88501998, "learning_rate": 7.692e-05, "memory(GiB)": 17.87, "train_speed(iter/s)": 0.308513, "rewards/chosen": 0.16992188, "rewards/rejected": 0.12011719, "rewards/accuracies": 0.40000001, "rewards/margins": 0.05004883, "logps/rejected": -572.0, "logps/chosen": -644.0, "logits/rejected": 1.296875, "logits/chosen": -0.44140625, "nll_loss": 1.5, "epoch": 0.2, "global_step/max_steps": "10/250", "percentage": "4.00%", "elapsed_time": "29s", "remaining_time": "11m 46s"} +{"loss": 1.74472656, "grad_norm": 6.0487064, "learning_rate": 9.998e-05, "memory(GiB)": 31.89, "train_speed(iter/s)": 0.315272, "rewards/chosen": 2.53125, "rewards/rejected": 0.40039062, "rewards/accuracies": 1.0, "rewards/margins": 2.125, "logps/rejected": -1552.0, "logps/chosen": -924.0, "logits/rejected": 1.828125, "logits/chosen": -0.74609375, "nll_loss": 1.4609375, "epoch": 0.3, "global_step/max_steps": "15/250", "percentage": "6.00%", "elapsed_time": "44s", "remaining_time": "11m 38s"} diff --git 
a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250127-004222/runs/events.out.tfevents.1737938614.kml-task-547024-record-9975763-prod-worker-0.98735.0 b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250127-004222/runs/events.out.tfevents.1737938614.kml-task-547024-record-9975763-prod-worker-0.98735.0 new file mode 100644 index 0000000000000000000000000000000000000000..18c1a1b5e86f7ec6c4763e49a3ce20a201fec72d --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v0-20250127-004222/runs/events.out.tfevents.1737938614.kml-task-547024-record-9975763-prod-worker-0.98735.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:935ce90e0b2449be4342e4f92ac0fdfde1d5b64f1cd308c9aa71594a2cde8cde +size 10985 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/args.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/args.json new file mode 100644 index 0000000000000000000000000000000000000000..4b016b97ef2d4589dae9d4623f25925cb829d241 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + 
"enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + 
"stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": 
null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, 
+ "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-7b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-7b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', 
hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, 
save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 
'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, 
sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/README.md b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..525e0a2a5243ae77491e227cbf74fa8ff0a43579 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-7b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. 
+ +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/adapter_config.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..454fa4ee7fc180fe11671ee7f6c90d0ba44328a6 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + 
"rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "gate_proj", + "up_proj", + "o_proj", + "v_proj", + "down_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/adapter_model.safetensors b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9733cccd9ea5deecc329271d8c932c6c06532462 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de8b8fabadec09702a89789018a18bca23053fddd03898ae0f99f05c8170ca30 +size 40422208 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/additional_config.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/args.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/args.json new file mode 100644 index 0000000000000000000000000000000000000000..4b016b97ef2d4589dae9d4623f25925cb829d241 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/args.json @@ -0,0 +1,371 @@ +{ + "model": 
"/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + 
"do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + 
"fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + 
"push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + 
"galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-7b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-7b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, 
task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, 
per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, 
use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, 
neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d2f22a46c55b11a9477148548fdee5e032416777 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f554d663d233c6a2f083ae805d6ef0a8c6b0fc11938ef136a8872eb783be131 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ecdb57a1973316cb67114bf7dff9127e3a5b1b47 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:879cfeefb6f50793b212c7f2ff78d26d8cd3c3f28ca406f8635e596b48c62b58 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f1b3e939153952351c5e743a5cb926cc8b3465ad --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f4f11de66ddf01852eddbc8b0fc3b39af4927601e9ba4311b132b59aa3228c7 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6157ab691f0f2be79255d16f8db26d1ce44b8613 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:6897972f4dcc562a1157d8d3bf671419d73762cdc42a9ee1031bbd1ba4c346e9 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c506d6c0b82b95c4d2df99d4180a3a16d2758c33 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba09f3f0f6c92e4f9e414f0aa25780462766d901efad07c7fc631a6cbc550a3a +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3948c55a00bbc08f5737b67c32759ad75ec55ad4 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b314817327a4570dfcf2ec59dbf72091c6f00f6ff41b4ba5450ab933fed2e57c +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..580dc5a8a98a1c0e19ed3f40c52b25c371b250ea --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2a0907b5acac2138a23e4dcc18ca46a9e895ca9f34711167aab1e5b72025f37 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e9d371539593c144b5f58c0086e6d416a62383f --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a322289b3fba9e440a89b6870c2e7e237fa2042c31e93d6bcf38f630a58163b +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eaffdcf57044390ca8e061bbed30368f0b158d87 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55b7b79aa4c8e5559132cddf3296127adb4b87a4c117504c81ceec04191001f6 +size 388374 diff --git 
a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/zero_pp_rank_1_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b16b14e208d73a5f130103d89f1cc553f9eea14f --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b43b98161b1c1ebc20b36561ac01f0a60a09f3c8c1de673396e906004ffdbb3c +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/zero_pp_rank_2_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5ab11c397a32ff009bd2643ca4650485e2d56e30 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c4c3bb47f1c1285cccb02cc059bc947399f5a67b3a95cd24771e84aca736474 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/zero_pp_rank_3_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2909b4b5b5cefe0bbc1a485565551c0165217822 --- /dev/null +++ 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5115d603db725df285fc537f54a21d19f020912b5850fde2055f0f8c086138f7 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/zero_pp_rank_4_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0d5694d06b34ea2b18a0b9f3f12194ad3c1c9e35 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3863047585aa6723495789d011b6635fecf8822d59d97ad5616b251fc734137 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/zero_pp_rank_5_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a730bf91a609a3e699f25911df955b63247b955 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ace43fde9ac6f40de46bf6d58bf06d7d3233958481fabc2ecd55de51aa3d768 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/zero_pp_rank_6_mp_rank_00_model_states.pt 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a83c7cdb2f8370e3515f12ea338bed4fece5ac3 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d2d780feced14807d6c564c078bae76f97d5df0c262e37c5fc8c9008c957df3 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/zero_pp_rank_7_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a428358996083fc9b823ad66f85ae0e15a5fd1aa --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/global_step100/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dd1868b1ce5c2319066c8a53d5c3605743ec3d044b701decc8da45a64502d85 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/latest b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/latest new file mode 100644 index 0000000000000000000000000000000000000000..744ae7dbad571b6f37ec6c7066549494261bb59e --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/latest @@ -0,0 +1 @@ +global_step100 \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/rng_state_0.pth 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..572d9bd86f4559e91e7b9a4fdc47494e5c6e9568 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7d7e02ffb4d440dce7ab4ce0b5617578ec9ce3672acee7434ed6f1153f1ae0c +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/rng_state_1.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..0d21df4c1d8717a3994f151fbc05460a0172725e --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5b40ca759e432b2688f021b81291d74a40f56a205e9842119f7e772275eebd3 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/rng_state_2.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..6048bfa1e35e3b563aec9f5c1c6788496c3f068d --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdaef955ddd36d6bc1c40584113dd6205483e2aa85b02439b8b27e82e02a8359 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/rng_state_3.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/rng_state_3.pth new file mode 100644 index 
0000000000000000000000000000000000000000..3722ed81a034ae380c794d8b45b0464c00099aa6 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10b14ae5db356e6512538751d6b386c190754e307cc99cd652d5c6dd891e1f82 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/rng_state_4.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..13231ff967baa9c056d5a7ec0cc489a62679039c --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f26e28be26826eeeed244b77185c67b443ac185175f8d4bf5ba94caa8b271bc5 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/rng_state_5.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..f3401deecf687fd1382dae699b8d2e1a52949a4a --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:847cedc1d6ca26f299a132c2ade9754887374acb9d98f26594a85d4c7742d474 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/rng_state_6.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..90ab10b8ff32ba08d69bdf75cb904d226b3d9008 --- /dev/null +++ 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcd043d1690ae0ff6991b03322799a0b28f021427b15fd9f1e5ed8b9905d9307 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/rng_state_7.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..3c6fb5670c4f108f08c81f04f22272cdd57b7745 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:772190f7e6667c865d25fc72da7bdd1b5d39f46fe03bb5c2d754aee1ad3c99c7 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/scheduler.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..9e44eb3fa45a557460f92c098f2fd1eccee781e9 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:815e2e2737d63e3807ccb5977a1e9e37b3ec45ffad7bcc727e9e137aebc01368 +size 1064 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/trainer_state.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..59dff01dec6d0dc0b82f3533e1a56189a2e6a282 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/trainer_state.json @@ -0,0 +1,496 @@ +{ + "best_metric": 0.50097656, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80", + "epoch": 2.0, + "eval_steps": 20, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 9.15866064284304, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": 2.390625, + "logits/rejected": -2.546875, + "logps/chosen": -520.0, + "logps/rejected": -142.0, + "loss": 2.4697265625, + "memory(GiB)": 8.4, + "nll_loss": 0.51171875, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.126463 + }, + { + "epoch": 0.1, + "grad_norm": 26.585759281524314, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": 0.1533203125, + "logits/rejected": -0.8671875, + "logps/chosen": -728.0, + "logps/rejected": -304.0, + "loss": 2.3878173828125, + "memory(GiB)": 12.04, + "nll_loss": 1.1484375, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.125, + "rewards/margins": -0.0250244140625, + "rewards/rejected": 0.150390625, + "step": 5, + "train_speed(iter/s)": 0.276117 + }, + { + "epoch": 0.2, + "grad_norm": 8.664467296577401, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -0.419921875, + "logits/rejected": 1.2890625, + "logps/chosen": -644.0, + "logps/rejected": -572.0, + "loss": 2.18505859375, + "memory(GiB)": 17.87, + "nll_loss": 1.4921875, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.1103515625, + "rewards/margins": 0.08984375, + "rewards/rejected": 0.0201416015625, + "step": 10, + "train_speed(iter/s)": 0.313547 + }, + { + "epoch": 0.3, + "grad_norm": 6.325430646213694, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -0.7578125, + "logits/rejected": 1.8203125, + "logps/chosen": -924.0, + "logps/rejected": -1552.0, + "loss": 1.76845703125, + "memory(GiB)": 32.02, + 
"nll_loss": 1.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.75, + "rewards/margins": 2.390625, + "rewards/rejected": 0.359375, + "step": 15, + "train_speed(iter/s)": 0.318522 + }, + { + "epoch": 0.4, + "grad_norm": 2.351010309283972, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": 0.5859375, + "logits/rejected": -0.4375, + "logps/chosen": -432.0, + "logps/rejected": -324.0, + "loss": 1.172265625, + "memory(GiB)": 32.02, + "nll_loss": 0.6640625, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 4.65625, + "rewards/margins": 1.5546875, + "rewards/rejected": 3.09375, + "step": 20, + "train_speed(iter/s)": 0.331194 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.3125, + "eval_logits/rejected": 0.86328125, + "eval_logps/chosen": -14.3125, + "eval_logps/rejected": -182.0, + "eval_loss": 0.6728515625, + "eval_nll_loss": 0.62109375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.375, + "eval_rewards/margins": 3.5625, + "eval_rewards/rejected": 5.8125, + "eval_runtime": 1.2862, + "eval_samples_per_second": 3.11, + "eval_steps_per_second": 0.778, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.5984307422346672, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": 0.255859375, + "logits/rejected": -0.0390625, + "logps/chosen": -326.0, + "logps/rejected": -420.0, + "loss": 0.8047607421875, + "memory(GiB)": 32.02, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.125, + "rewards/margins": 5.40625, + "rewards/rejected": 3.6875, + "step": 25, + "train_speed(iter/s)": 0.330795 + }, + { + "epoch": 0.6, + "grad_norm": 0.5223734060204478, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -0.21875, + "logits/rejected": 1.546875, + "logps/chosen": -400.0, + "logps/rejected": -656.0, + "loss": 0.6901611328125, + "memory(GiB)": 32.02, + "nll_loss": 0.78125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.5625, + "rewards/margins": 6.78125, + "rewards/rejected": 3.765625, + 
"step": 30, + "train_speed(iter/s)": 0.339407 + }, + { + "epoch": 0.7, + "grad_norm": 1.0839186250481967, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": 1.5625, + "logits/rejected": -1.578125, + "logps/chosen": -592.0, + "logps/rejected": -322.0, + "loss": 0.7055419921875, + "memory(GiB)": 32.02, + "nll_loss": 0.96875, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.75, + "rewards/margins": 9.625, + "rewards/rejected": 3.125, + "step": 35, + "train_speed(iter/s)": 0.346622 + }, + { + "epoch": 0.8, + "grad_norm": 13.025539300060226, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": 0.08740234375, + "logits/rejected": 0.71484375, + "logps/chosen": -225.0, + "logps/rejected": -322.0, + "loss": 0.6363037109375, + "memory(GiB)": 32.02, + "nll_loss": 0.56640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.0625, + "rewards/margins": 7.53125, + "rewards/rejected": 3.546875, + "step": 40, + "train_speed(iter/s)": 0.345794 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -0.8046875, + "eval_logits/rejected": 0.9375, + "eval_logps/chosen": -7.1875, + "eval_logps/rejected": -212.0, + "eval_loss": 0.5283203125, + "eval_nll_loss": 0.3125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 7.3125, + "eval_rewards/rejected": 2.796875, + "eval_runtime": 1.2905, + "eval_samples_per_second": 3.1, + "eval_steps_per_second": 0.775, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.49724108967226127, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": 2.890625, + "logits/rejected": 0.002349853515625, + "logps/chosen": -426.0, + "logps/rejected": -430.0, + "loss": 0.5437744140625, + "memory(GiB)": 32.02, + "nll_loss": 0.470703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.0625, + "rewards/margins": 10.6875, + "rewards/rejected": 3.390625, + "step": 45, + "train_speed(iter/s)": 0.342553 + }, + { + "epoch": 1.0, + "grad_norm": 1.4155859018634052, + "learning_rate": 9.410582299213573e-05, + 
"logits/chosen": 0.2451171875, + "logits/rejected": 1.75, + "logps/chosen": -804.0, + "logps/rejected": -1480.0, + "loss": 0.57803955078125, + "memory(GiB)": 32.02, + "nll_loss": 0.6953125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.25, + "rewards/margins": 9.125, + "rewards/rejected": 7.125, + "step": 50, + "train_speed(iter/s)": 0.346029 + }, + { + "epoch": 1.1, + "grad_norm": 0.5694275670231188, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": 1.8359375, + "logits/rejected": 0.703125, + "logps/chosen": -386.0, + "logps/rejected": -612.0, + "loss": 0.6138671875, + "memory(GiB)": 43.66, + "nll_loss": 0.546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.75, + "rewards/margins": 12.25, + "rewards/rejected": 3.53125, + "step": 55, + "train_speed(iter/s)": 0.347089 + }, + { + "epoch": 1.2, + "grad_norm": 0.3472486708219598, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": 1.65625, + "logits/rejected": 2.15625, + "logps/chosen": -233.0, + "logps/rejected": -520.0, + "loss": 0.5795654296875, + "memory(GiB)": 43.66, + "nll_loss": 0.396484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.375, + "rewards/margins": 12.1875, + "rewards/rejected": 3.140625, + "step": 60, + "train_speed(iter/s)": 0.349928 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -0.23828125, + "eval_logits/rejected": 1.703125, + "eval_logps/chosen": -7.3125, + "eval_logps/rejected": -223.0, + "eval_loss": 0.51806640625, + "eval_nll_loss": 0.31640625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 8.4375, + "eval_rewards/rejected": 1.703125, + "eval_runtime": 1.2918, + "eval_samples_per_second": 3.096, + "eval_steps_per_second": 0.774, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.48830819519223906, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": 0.97265625, + "logits/rejected": 1.9921875, + "logps/chosen": -346.0, + "logps/rejected": -844.0, + "loss": 0.6193359375, + "memory(GiB)": 
43.66, + "nll_loss": 0.640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5625, + "rewards/margins": 11.875, + "rewards/rejected": 3.703125, + "step": 65, + "train_speed(iter/s)": 0.34619 + }, + { + "epoch": 1.4, + "grad_norm": 0.3461001279478362, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -0.6875, + "logits/rejected": 2.28125, + "logps/chosen": -88.0, + "logps/rejected": -1120.0, + "loss": 0.5681396484375, + "memory(GiB)": 43.66, + "nll_loss": 0.84375, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 11.5625, + "rewards/margins": 9.125, + "rewards/rejected": 2.484375, + "step": 70, + "train_speed(iter/s)": 0.349389 + }, + { + "epoch": 1.5, + "grad_norm": 0.49738864814975586, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": 2.515625, + "logits/rejected": 0.263671875, + "logps/chosen": -462.0, + "logps/rejected": -434.0, + "loss": 0.517205810546875, + "memory(GiB)": 43.66, + "nll_loss": 0.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.625, + "rewards/margins": 16.375, + "rewards/rejected": 2.28125, + "step": 75, + "train_speed(iter/s)": 0.351345 + }, + { + "epoch": 1.6, + "grad_norm": 0.3498527388541882, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": 3.5625, + "logits/rejected": -1.296875, + "logps/chosen": -580.0, + "logps/rejected": -132.0, + "loss": 0.5629638671875, + "memory(GiB)": 43.66, + "nll_loss": 0.703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.5, + "rewards/margins": 15.6875, + "rewards/rejected": 5.84375, + "step": 80, + "train_speed(iter/s)": 0.353922 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -0.68359375, + "eval_logits/rejected": 0.3125, + "eval_logps/chosen": -6.3125, + "eval_logps/rejected": -306.0, + "eval_loss": 0.5009765625, + "eval_nll_loss": 0.2734375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.1875, + "eval_rewards/margins": 16.75, + "eval_rewards/rejected": -6.59375, + "eval_runtime": 1.3254, + "eval_samples_per_second": 3.018, 
+ "eval_steps_per_second": 0.754, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.6649877164961882, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": 0.72265625, + "logits/rejected": 0.388671875, + "logps/chosen": -326.0, + "logps/rejected": -768.0, + "loss": 0.70977783203125, + "memory(GiB)": 43.66, + "nll_loss": 0.458984375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5, + "rewards/margins": 18.375, + "rewards/rejected": -2.859375, + "step": 85, + "train_speed(iter/s)": 0.351477 + }, + { + "epoch": 1.8, + "grad_norm": 0.5774391096810936, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": 2.15625, + "logits/rejected": 1.515625, + "logps/chosen": -290.0, + "logps/rejected": -520.0, + "loss": 0.5434326171875, + "memory(GiB)": 43.66, + "nll_loss": 0.56640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.75, + "rewards/margins": 14.25, + "rewards/rejected": 0.5390625, + "step": 90, + "train_speed(iter/s)": 0.351018 + }, + { + "epoch": 1.9, + "grad_norm": 0.46335079012099145, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": 1.65625, + "logits/rejected": 1.1640625, + "logps/chosen": -366.0, + "logps/rejected": -620.0, + "loss": 0.60682373046875, + "memory(GiB)": 43.66, + "nll_loss": 0.5, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.75, + "rewards/margins": 15.9375, + "rewards/rejected": 0.7890625, + "step": 95, + "train_speed(iter/s)": 0.351082 + }, + { + "epoch": 2.0, + "grad_norm": 0.44055183589508673, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": 0.59765625, + "logits/rejected": 0.1025390625, + "logps/chosen": -276.0, + "logps/rejected": -360.0, + "loss": 0.51856689453125, + "memory(GiB)": 43.66, + "nll_loss": 0.470703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.6875, + "rewards/margins": 14.0, + "rewards/rejected": 1.65625, + "step": 100, + "train_speed(iter/s)": 0.350869 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -0.63671875, + "eval_logits/rejected": 0.66796875, + 
"eval_logps/chosen": -6.84375, + "eval_logps/rejected": -280.0, + "eval_loss": 0.50390625, + "eval_nll_loss": 0.298828125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 14.125, + "eval_rewards/rejected": -4.0, + "eval_runtime": 1.3254, + "eval_samples_per_second": 3.018, + "eval_steps_per_second": 0.754, + "step": 100 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 87421997940736.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/training_args.bin b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..475462e40417630252a03e5a9352b5e11f000ed1 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:647dcffabf1a4c913f292b185978f2f179c4819a3df809536d284c803d9d97fe +size 8888 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/zero_to_fp32.py b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-100/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = 
os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = 
state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensors instead of torch tensors, which is more memory efficient. + Convert a pseudo tensor to a torch tensor by calling ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint.
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.items(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it is no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint.
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/README.md b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/README.md new file mode 100644 index 0000000000000000000000000000000000000000..525e0a2a5243ae77491e227cbf74fa8ff0a43579 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-7b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both 
direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/adapter_config.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..454fa4ee7fc180fe11671ee7f6c90d0ba44328a6 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + 
"rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "gate_proj", + "up_proj", + "o_proj", + "v_proj", + "down_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/adapter_model.safetensors b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ab458a95ae5bc428f7746d9430f26f6f62644b51 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:135d98093b81f8bf83312d668426eab319600bddec71471a7ca11d94a55dbbb6 +size 40422208 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/additional_config.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/args.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/args.json new file mode 100644 index 0000000000000000000000000000000000000000..4b016b97ef2d4589dae9d4623f25925cb829d241 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/args.json @@ -0,0 +1,371 @@ +{ + "model": 
"/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + 
"do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + 
"fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + 
"push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + 
"galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-7b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-7b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, 
task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, 
per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, 
use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, 
neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a50edab28f42a7114f751d767310e2b991a56aa --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c710e1ceff8ba961e1bcab1b8ce67a36cfef9266dc48020f46838d8b822f082 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..00a0c2e063fe5491691509ca74fc6abcae312f89 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7db48f1c874bda700ca6e86470dd1cd7b22121acf556a7bbf7dc8181a03bd1a4 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1c7dc5175eb7309ab532f4ae06b31cdab15e06ef --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38feb17bf4adcb11c430e2cddadb446ccdc74b10ea0a87a1a99d9205a8a15cdb +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..58afd0d0fa44e43e6e6f6ac070add71bf5613292 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:837a03af3318c9b30d2c518c834b72560440302a9bd612772e10522de73c70b4 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..38ffd8e7504af1797122b239f7606d801c9915ee --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfff9824741458d7089e973fdaca77f33729b53800202f9e97c3905067727026 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2281b8ae2def6ca53380a4293ca870440175f3d4 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2d9199d156cf2e7a344ded7b2bc123afba0e84d72b0abf65514709ac6326222 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..ee90f91a238308dff7f409995a6dba1442b9db8d --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19c5d1762e690de55b841f6212ac76c6728b1caefa444df087e2f45b084da70d +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5fa6ed1f2dcaf9e922a6cb8115d356b98a6e4bf1 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6259803f12d6c3d7b7f754107563af7871e2dfaf2873113af55b16b051e6e96e +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/zero_pp_rank_0_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ac82b9a9b8eadbe29bdc12a7d92451c9a2fb18c8 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20dcc0b86f2d1739793151b52a4b97f09e70c26e486b5936511397a728a7887a +size 388374 diff --git 
a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/zero_pp_rank_1_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef8decee60871f6860f1dbb6e05d3c103e4ba7d5 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c7534be64c970c699e8d4bd5337ccb631fba25a68950bc22c5d1036d267e7f6 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/zero_pp_rank_2_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..70d232ec4170539c59d68c2141fd4dbad1aeb3f6 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8627526f5fa6011c6a6c823e2ace6ddcdca4b22853b211ad7f33fa98d53c4bce +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/zero_pp_rank_3_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e76a2ea58041e7c448920e82e537febc562728e --- /dev/null +++ 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73fa122683ae15695a0b367f8b1f4a2c76e093d89c3d9a73af882467f7c1acb5 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/zero_pp_rank_4_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e86bb1f6bca949d8b9256364b07d0fedf4e88749 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b39cc17bb69285d81d9d78ffa52c0861d6b75ab373a320853c5bf621d9533b63 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/zero_pp_rank_5_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..751a5363fc8c3a4bf60c80449caa90eaba1b4102 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d123e9b441fef58105f858176ba850ca2aab3efb06a693c0c5edf7ae897160c +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/zero_pp_rank_6_mp_rank_00_model_states.pt 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a13ec4c5bfca47a9653e87745ef9fbb52c0eb36a --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4534309530b5d06d7568ada1a0f43c78095f7a53357308f7823227170f4c713c +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/zero_pp_rank_7_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c3f88f18d041fa82222c5705d34e8e2b5e4ac8a2 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/global_step120/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bf32d433c26e2ec57fdb70ba88389192c3bfd58eaee9b42d06d4beb5b6c1e58 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/latest b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/latest new file mode 100644 index 0000000000000000000000000000000000000000..0e13e0563ec45a863d519305a1251d3e72b9e3e4 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/latest @@ -0,0 +1 @@ +global_step120 \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/rng_state_0.pth 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ee83ae5e323e0bb676daf05f7f41b7951b49c7af --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae9162e03c562553a5d9d13120f544d3c47ea71bb39aa44e18253675e17ed4a4 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/rng_state_1.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..0cd0edf74beb406ae74d27fac689e74cc1a7d12b --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4809456871b3a40c8db7e0926a9db11b01149a1d483fb29b16fc69dabaf36c6f +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/rng_state_2.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..378e4e23e02084387cef58f5bfa08ef5b23ef1b3 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bb6bcf25ff148b74eea7dd4895fc42e9433538fff5d75f0d2ae6cb0c2fdadf0 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/rng_state_3.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/rng_state_3.pth new file mode 100644 index 
0000000000000000000000000000000000000000..c9d23b00a6e62ab23a83b688e4077471f0501ba0 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f00ea04cd1a52c539d9cc948ac8a04676d6b99702acd09149565f781806f63f +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/rng_state_4.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..cf6105fec105f5636599de6b5ea414adc300ed30 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5571fb2fc1b413792b01ac691c759786855573992bab1d14875faccdaf8c881e +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/rng_state_5.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..983c7580e17a958602e3218e885e88e85d4ed9a0 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59019ba23ead9c15851cb4349397254458ce50ea3c2987090404f4f3842c6d8f +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/rng_state_6.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..1f87fedb0a1eac5d251eeb1e7cf58190877f6b60 --- /dev/null +++ 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45fdffda57fda4a555da7a5de6fc6ec7324e0dae048b92519af6c4f6a1bc7412 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/rng_state_7.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..d32d0d7a4ca68837a8e91f7101758f2f48116bde --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62fb2c13e63aba83c4505fae1639f79a33853d8f1bebe20cecb73bf53c8e7c46 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/scheduler.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..056328cf4dbfbdfaf5b7ffa668b29852f77a3798 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8b76da7ccfd8d1a286433da6127628e0c6a1565950b2dea51fe5864ad3e6545 +size 1064 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/trainer_state.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..59ded8c7b44a5d55ad5f6854c357efacbf34041b --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/trainer_state.json @@ -0,0 +1,585 @@ +{ + "best_metric": 0.49755859, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120", + "epoch": 2.4, + "eval_steps": 20, + "global_step": 120, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 9.15866064284304, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": 2.390625, + "logits/rejected": -2.546875, + "logps/chosen": -520.0, + "logps/rejected": -142.0, + "loss": 2.4697265625, + "memory(GiB)": 8.4, + "nll_loss": 0.51171875, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.126463 + }, + { + "epoch": 0.1, + "grad_norm": 26.585759281524314, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": 0.1533203125, + "logits/rejected": -0.8671875, + "logps/chosen": -728.0, + "logps/rejected": -304.0, + "loss": 2.3878173828125, + "memory(GiB)": 12.04, + "nll_loss": 1.1484375, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.125, + "rewards/margins": -0.0250244140625, + "rewards/rejected": 0.150390625, + "step": 5, + "train_speed(iter/s)": 0.276117 + }, + { + "epoch": 0.2, + "grad_norm": 8.664467296577401, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -0.419921875, + "logits/rejected": 1.2890625, + "logps/chosen": -644.0, + "logps/rejected": -572.0, + "loss": 2.18505859375, + "memory(GiB)": 17.87, + "nll_loss": 1.4921875, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.1103515625, + "rewards/margins": 0.08984375, + "rewards/rejected": 0.0201416015625, + "step": 10, + "train_speed(iter/s)": 0.313547 + }, + { + "epoch": 0.3, + "grad_norm": 6.325430646213694, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -0.7578125, + "logits/rejected": 1.8203125, + "logps/chosen": -924.0, + "logps/rejected": -1552.0, + "loss": 1.76845703125, + "memory(GiB)": 32.02, + 
"nll_loss": 1.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.75, + "rewards/margins": 2.390625, + "rewards/rejected": 0.359375, + "step": 15, + "train_speed(iter/s)": 0.318522 + }, + { + "epoch": 0.4, + "grad_norm": 2.351010309283972, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": 0.5859375, + "logits/rejected": -0.4375, + "logps/chosen": -432.0, + "logps/rejected": -324.0, + "loss": 1.172265625, + "memory(GiB)": 32.02, + "nll_loss": 0.6640625, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 4.65625, + "rewards/margins": 1.5546875, + "rewards/rejected": 3.09375, + "step": 20, + "train_speed(iter/s)": 0.331194 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.3125, + "eval_logits/rejected": 0.86328125, + "eval_logps/chosen": -14.3125, + "eval_logps/rejected": -182.0, + "eval_loss": 0.6728515625, + "eval_nll_loss": 0.62109375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.375, + "eval_rewards/margins": 3.5625, + "eval_rewards/rejected": 5.8125, + "eval_runtime": 1.2862, + "eval_samples_per_second": 3.11, + "eval_steps_per_second": 0.778, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.5984307422346672, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": 0.255859375, + "logits/rejected": -0.0390625, + "logps/chosen": -326.0, + "logps/rejected": -420.0, + "loss": 0.8047607421875, + "memory(GiB)": 32.02, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.125, + "rewards/margins": 5.40625, + "rewards/rejected": 3.6875, + "step": 25, + "train_speed(iter/s)": 0.330795 + }, + { + "epoch": 0.6, + "grad_norm": 0.5223734060204478, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -0.21875, + "logits/rejected": 1.546875, + "logps/chosen": -400.0, + "logps/rejected": -656.0, + "loss": 0.6901611328125, + "memory(GiB)": 32.02, + "nll_loss": 0.78125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.5625, + "rewards/margins": 6.78125, + "rewards/rejected": 3.765625, + 
"step": 30, + "train_speed(iter/s)": 0.339407 + }, + { + "epoch": 0.7, + "grad_norm": 1.0839186250481967, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": 1.5625, + "logits/rejected": -1.578125, + "logps/chosen": -592.0, + "logps/rejected": -322.0, + "loss": 0.7055419921875, + "memory(GiB)": 32.02, + "nll_loss": 0.96875, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.75, + "rewards/margins": 9.625, + "rewards/rejected": 3.125, + "step": 35, + "train_speed(iter/s)": 0.346622 + }, + { + "epoch": 0.8, + "grad_norm": 13.025539300060226, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": 0.08740234375, + "logits/rejected": 0.71484375, + "logps/chosen": -225.0, + "logps/rejected": -322.0, + "loss": 0.6363037109375, + "memory(GiB)": 32.02, + "nll_loss": 0.56640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.0625, + "rewards/margins": 7.53125, + "rewards/rejected": 3.546875, + "step": 40, + "train_speed(iter/s)": 0.345794 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -0.8046875, + "eval_logits/rejected": 0.9375, + "eval_logps/chosen": -7.1875, + "eval_logps/rejected": -212.0, + "eval_loss": 0.5283203125, + "eval_nll_loss": 0.3125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 7.3125, + "eval_rewards/rejected": 2.796875, + "eval_runtime": 1.2905, + "eval_samples_per_second": 3.1, + "eval_steps_per_second": 0.775, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.49724108967226127, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": 2.890625, + "logits/rejected": 0.002349853515625, + "logps/chosen": -426.0, + "logps/rejected": -430.0, + "loss": 0.5437744140625, + "memory(GiB)": 32.02, + "nll_loss": 0.470703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.0625, + "rewards/margins": 10.6875, + "rewards/rejected": 3.390625, + "step": 45, + "train_speed(iter/s)": 0.342553 + }, + { + "epoch": 1.0, + "grad_norm": 1.4155859018634052, + "learning_rate": 9.410582299213573e-05, + 
"logits/chosen": 0.2451171875, + "logits/rejected": 1.75, + "logps/chosen": -804.0, + "logps/rejected": -1480.0, + "loss": 0.57803955078125, + "memory(GiB)": 32.02, + "nll_loss": 0.6953125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.25, + "rewards/margins": 9.125, + "rewards/rejected": 7.125, + "step": 50, + "train_speed(iter/s)": 0.346029 + }, + { + "epoch": 1.1, + "grad_norm": 0.5694275670231188, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": 1.8359375, + "logits/rejected": 0.703125, + "logps/chosen": -386.0, + "logps/rejected": -612.0, + "loss": 0.6138671875, + "memory(GiB)": 43.66, + "nll_loss": 0.546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.75, + "rewards/margins": 12.25, + "rewards/rejected": 3.53125, + "step": 55, + "train_speed(iter/s)": 0.347089 + }, + { + "epoch": 1.2, + "grad_norm": 0.3472486708219598, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": 1.65625, + "logits/rejected": 2.15625, + "logps/chosen": -233.0, + "logps/rejected": -520.0, + "loss": 0.5795654296875, + "memory(GiB)": 43.66, + "nll_loss": 0.396484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.375, + "rewards/margins": 12.1875, + "rewards/rejected": 3.140625, + "step": 60, + "train_speed(iter/s)": 0.349928 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -0.23828125, + "eval_logits/rejected": 1.703125, + "eval_logps/chosen": -7.3125, + "eval_logps/rejected": -223.0, + "eval_loss": 0.51806640625, + "eval_nll_loss": 0.31640625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 8.4375, + "eval_rewards/rejected": 1.703125, + "eval_runtime": 1.2918, + "eval_samples_per_second": 3.096, + "eval_steps_per_second": 0.774, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.48830819519223906, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": 0.97265625, + "logits/rejected": 1.9921875, + "logps/chosen": -346.0, + "logps/rejected": -844.0, + "loss": 0.6193359375, + "memory(GiB)": 
43.66, + "nll_loss": 0.640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5625, + "rewards/margins": 11.875, + "rewards/rejected": 3.703125, + "step": 65, + "train_speed(iter/s)": 0.34619 + }, + { + "epoch": 1.4, + "grad_norm": 0.3461001279478362, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -0.6875, + "logits/rejected": 2.28125, + "logps/chosen": -88.0, + "logps/rejected": -1120.0, + "loss": 0.5681396484375, + "memory(GiB)": 43.66, + "nll_loss": 0.84375, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 11.5625, + "rewards/margins": 9.125, + "rewards/rejected": 2.484375, + "step": 70, + "train_speed(iter/s)": 0.349389 + }, + { + "epoch": 1.5, + "grad_norm": 0.49738864814975586, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": 2.515625, + "logits/rejected": 0.263671875, + "logps/chosen": -462.0, + "logps/rejected": -434.0, + "loss": 0.517205810546875, + "memory(GiB)": 43.66, + "nll_loss": 0.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.625, + "rewards/margins": 16.375, + "rewards/rejected": 2.28125, + "step": 75, + "train_speed(iter/s)": 0.351345 + }, + { + "epoch": 1.6, + "grad_norm": 0.3498527388541882, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": 3.5625, + "logits/rejected": -1.296875, + "logps/chosen": -580.0, + "logps/rejected": -132.0, + "loss": 0.5629638671875, + "memory(GiB)": 43.66, + "nll_loss": 0.703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.5, + "rewards/margins": 15.6875, + "rewards/rejected": 5.84375, + "step": 80, + "train_speed(iter/s)": 0.353922 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -0.68359375, + "eval_logits/rejected": 0.3125, + "eval_logps/chosen": -6.3125, + "eval_logps/rejected": -306.0, + "eval_loss": 0.5009765625, + "eval_nll_loss": 0.2734375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.1875, + "eval_rewards/margins": 16.75, + "eval_rewards/rejected": -6.59375, + "eval_runtime": 1.3254, + "eval_samples_per_second": 3.018, 
+ "eval_steps_per_second": 0.754, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.6649877164961882, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": 0.72265625, + "logits/rejected": 0.388671875, + "logps/chosen": -326.0, + "logps/rejected": -768.0, + "loss": 0.70977783203125, + "memory(GiB)": 43.66, + "nll_loss": 0.458984375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5, + "rewards/margins": 18.375, + "rewards/rejected": -2.859375, + "step": 85, + "train_speed(iter/s)": 0.351477 + }, + { + "epoch": 1.8, + "grad_norm": 0.5774391096810936, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": 2.15625, + "logits/rejected": 1.515625, + "logps/chosen": -290.0, + "logps/rejected": -520.0, + "loss": 0.5434326171875, + "memory(GiB)": 43.66, + "nll_loss": 0.56640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.75, + "rewards/margins": 14.25, + "rewards/rejected": 0.5390625, + "step": 90, + "train_speed(iter/s)": 0.351018 + }, + { + "epoch": 1.9, + "grad_norm": 0.46335079012099145, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": 1.65625, + "logits/rejected": 1.1640625, + "logps/chosen": -366.0, + "logps/rejected": -620.0, + "loss": 0.60682373046875, + "memory(GiB)": 43.66, + "nll_loss": 0.5, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.75, + "rewards/margins": 15.9375, + "rewards/rejected": 0.7890625, + "step": 95, + "train_speed(iter/s)": 0.351082 + }, + { + "epoch": 2.0, + "grad_norm": 0.44055183589508673, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": 0.59765625, + "logits/rejected": 0.1025390625, + "logps/chosen": -276.0, + "logps/rejected": -360.0, + "loss": 0.51856689453125, + "memory(GiB)": 43.66, + "nll_loss": 0.470703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.6875, + "rewards/margins": 14.0, + "rewards/rejected": 1.65625, + "step": 100, + "train_speed(iter/s)": 0.350869 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -0.63671875, + "eval_logits/rejected": 0.66796875, + 
"eval_logps/chosen": -6.84375, + "eval_logps/rejected": -280.0, + "eval_loss": 0.50390625, + "eval_nll_loss": 0.298828125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 14.125, + "eval_rewards/rejected": -4.0, + "eval_runtime": 1.3254, + "eval_samples_per_second": 3.018, + "eval_steps_per_second": 0.754, + "step": 100 + }, + { + "epoch": 2.1, + "grad_norm": 0.43319830084665456, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": 3.5, + "logits/rejected": 0.185546875, + "logps/chosen": -442.0, + "logps/rejected": -580.0, + "loss": 0.540869140625, + "memory(GiB)": 43.66, + "nll_loss": 0.60546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.375, + "rewards/margins": 16.125, + "rewards/rejected": 4.25, + "step": 105, + "train_speed(iter/s)": 0.34793 + }, + { + "epoch": 2.2, + "grad_norm": 0.5087186205209226, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": 0.41015625, + "logits/rejected": 1.9375, + "logps/chosen": -688.0, + "logps/rejected": -1280.0, + "loss": 0.5629150390625, + "memory(GiB)": 43.66, + "nll_loss": 0.69140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.875, + "rewards/margins": 14.5, + "rewards/rejected": 6.34375, + "step": 110, + "train_speed(iter/s)": 0.34833 + }, + { + "epoch": 2.3, + "grad_norm": 0.30352973245628134, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": 1.546875, + "logits/rejected": 1.0625, + "logps/chosen": -568.0, + "logps/rejected": -704.0, + "loss": 0.596630859375, + "memory(GiB)": 43.66, + "nll_loss": 0.5390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.0, + "rewards/margins": 15.4375, + "rewards/rejected": 5.5, + "step": 115, + "train_speed(iter/s)": 0.349547 + }, + { + "epoch": 2.4, + "grad_norm": 0.5099680534446042, + "learning_rate": 5.7592521739125726e-05, + "logits/chosen": 2.40625, + "logits/rejected": 0.002349853515625, + "logps/chosen": -368.0, + "logps/rejected": -316.0, + "loss": 0.463055419921875, + "memory(GiB)": 
43.66, + "nll_loss": 0.37109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.625, + "rewards/margins": 13.1875, + "rewards/rejected": 3.484375, + "step": 120, + "train_speed(iter/s)": 0.35126 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -0.96484375, + "eval_logits/rejected": 0.416015625, + "eval_logps/chosen": -6.4375, + "eval_logps/rejected": -280.0, + "eval_loss": 0.49755859375, + "eval_nll_loss": 0.279296875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.1875, + "eval_rewards/margins": 14.1875, + "eval_rewards/rejected": -4.0, + "eval_runtime": 1.3007, + "eval_samples_per_second": 3.075, + "eval_steps_per_second": 0.769, + "step": 120 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 104748905005056.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/training_args.bin b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..475462e40417630252a03e5a9352b5e11f000ed1 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:647dcffabf1a4c913f292b185978f2f179c4819a3df809536d284c803d9d97fe +size 8888 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/zero_to_fp32.py b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/zero_to_fp32.py new file mode 100755 index 
0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/README.md b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/README.md new file mode 100644 index 0000000000000000000000000000000000000000..525e0a2a5243ae77491e227cbf74fa8ff0a43579 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-7b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both 
direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/adapter_config.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..454fa4ee7fc180fe11671ee7f6c90d0ba44328a6 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + 
"rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "gate_proj", + "up_proj", + "o_proj", + "v_proj", + "down_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/adapter_model.safetensors b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e31cc26f6542c9b33f5129bb9976baa48b5d0117 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be4f1448ac0d210318296bda6a4eddb9148881fe40c034c679ed2dd819e55f87 +size 40422208 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/additional_config.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/args.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/args.json new file mode 100644 index 0000000000000000000000000000000000000000..4b016b97ef2d4589dae9d4623f25925cb829d241 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/args.json @@ -0,0 +1,371 @@ +{ + "model": 
"/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + 
"do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + 
"fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + 
"push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + 
"galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-7b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-7b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, 
task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, 
per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, 
use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, 
neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..05127dbd1b4f9f3b5bdd8928d8ec369810307049 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5bc16c1d1abffbd7e396f37eac4e456d2bb26844e3e5def4baa0d02b371f636 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..30e319300b59283470f893b75193e6df2dff8082 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:011e846b6932bcdd9dd70390c1b65101bfcd7442f0e96155cb27e633d1251333 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4c3dbee89de50d7308252a6b3b76993bb034ce4e --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c24f87940a1b3cdd8856d952b049525366e5ffa9ddcc64a780d0ac7342ec2b8b +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8d5af88107b497f8186fe5980f132c13cfd3788c --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:47f7ab7c8596cb68846dfd0c4c6c7ebaa3f026dc4df8be8e50e33036b075704d +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c755e8f2066350b809ee71476a7b0a1247b422d3 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1023dca2fc34e681f4190573337ebb3a17bf7f70a025c02aaf02750610ae49bc +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..88aa6cace6b744910d7c3ef677767bb5e0d9bf1f --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3651aef4ee68a94c9527d7f23fbc0f93fae8a965029825b541b559cfac6e73b5 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..e0e90bcdb9976326d4cb825c24387896f8e6ee62 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a27b6e9478ce5846a424fc537e7c197502d0d44121fb1047bbea3ccc5a197a08 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..91d84e13dae6eba9dacc7958163cebd190a24027 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24369445f701cad440214640ccebf38634794aa1f3fca5c04586463ba8698b23 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/zero_pp_rank_0_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..506d38c357a83176094fa30423efafb4c1e311c4 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b6b77d678e47270ff16991cb8c7d582a8fa78a215f84d3fe313bc8206e44794 +size 388374 diff --git 
a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/zero_pp_rank_1_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..51de01714ef71e42f436ad763ab6f6e6d48b6f5e --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc1129756bbfabe0171869d0f5c8ca33815b027bb01eaf0697f85d9dbf8e172e +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/zero_pp_rank_2_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4eb6a40028200215188cf1856db7e0b41fc06adb --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4855d030ef5fbaa3f88e197dce5808d5f2cf54fafb76dd033f233490641508a7 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/zero_pp_rank_3_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..85e231b462381cfec6a5dacf1c42b0d15260c456 --- /dev/null +++ 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ff4ced74c72e92b20f8b233a3687d37081b96ec199ab8bc5fe909120eaf6539 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/zero_pp_rank_4_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a26438151852ac800880f2e2930269a1404badf1 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2aee2711f3afd46a9a64cb06833cea6809b00e2d650715771cbffa25f3a86f47 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/zero_pp_rank_5_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..77cdb7abf879a9334236c95abef1d604ae040a7b --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c4a88acddec1ca604461782e41f6dc805eb00369a23ba90f4b33908a0aeb54d +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/zero_pp_rank_6_mp_rank_00_model_states.pt 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d45c62d12cbf969a3755ac0f096ba8b416fe79a2 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bd4c70624d30070294f4d1046be8274dcf094b20b8208840ec0df5892a80518 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/zero_pp_rank_7_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4b2e694a863f5e2d99762a4951749e4996ce8064 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/global_step140/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf22b5447e97287db2369c36f70b2a6aea8c96f7a590db53f2b221cb46fa9938 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/latest b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/latest new file mode 100644 index 0000000000000000000000000000000000000000..fd2b9aef86529798137c2868d556e873a23c785c --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/latest @@ -0,0 +1 @@ +global_step140 \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/rng_state_0.pth 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..d4ade713ef57d0535c32a9251c786bc57de03d06 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb1165242405b17b3d6a8186ae61b13dcb1faa5a54320bebd74ef8d71b964bf7 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/rng_state_1.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d91c511b147b4dd17988903c57adcefb6c1f20b0 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:562c262916c9997ec644c42fed9655ab28706b74fca20290ca921c4761d6a4b0 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/rng_state_2.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..f71e829b3e3570a540263d07783c4e906a78a803 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8d40f8118f513299624ded0a9bcf09778b961635615090409394d4f96f928f6 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/rng_state_3.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/rng_state_3.pth new file mode 100644 index 
0000000000000000000000000000000000000000..be7f0176676a7c526bb10cbb336b2afa89d8841c --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4391f924238a4cb855c4cbdc6d1a14954f785431c75997d05c7a4ee6615dae7 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/rng_state_4.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..8dd1a877dd1f03799067fd08739e82b9f2cd2ad3 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be7b19bb9543a16bf9f4cd96466ac581436f63070f5815f3a7ba57980608994f +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/rng_state_5.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..dcf1b720014f72a27a09ab9ef8570430a8e3c96d --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97da4a1ede0a3e0f96411cacd5bfdf84d9355198f7aadc9bcb8be41122043f63 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/rng_state_6.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..2b58cbeed7b25ef61c6439aced60df473cbaf6d4 --- /dev/null +++ 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:544cb6421b975bd5d2b2360a4e666003794e6197ae654d2ad963cd6572a86ede +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/rng_state_7.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..36a7dcefe0e0264868d40586546699306878a454 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8d6eb32a23f3bef6262bbcb2eda724b2fd6f5e579969aa27c71a5971331722b +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/scheduler.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f6e613ad02e1482b1eef52ff51329fe67d4fceb --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a9c57c64e42f5d7ec5b6fd8bf14122cd4f49a4ae907dcde9c057b79cc82e639 +size 1064 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/trainer_state.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3242735b68c4662c03417ed440c7cc67bf77d931 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/trainer_state.json @@ -0,0 +1,674 @@ +{ + "best_metric": 0.49755859, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-120", + "epoch": 2.8, + "eval_steps": 20, + "global_step": 140, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 9.15866064284304, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": 2.390625, + "logits/rejected": -2.546875, + "logps/chosen": -520.0, + "logps/rejected": -142.0, + "loss": 2.4697265625, + "memory(GiB)": 8.4, + "nll_loss": 0.51171875, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.126463 + }, + { + "epoch": 0.1, + "grad_norm": 26.585759281524314, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": 0.1533203125, + "logits/rejected": -0.8671875, + "logps/chosen": -728.0, + "logps/rejected": -304.0, + "loss": 2.3878173828125, + "memory(GiB)": 12.04, + "nll_loss": 1.1484375, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.125, + "rewards/margins": -0.0250244140625, + "rewards/rejected": 0.150390625, + "step": 5, + "train_speed(iter/s)": 0.276117 + }, + { + "epoch": 0.2, + "grad_norm": 8.664467296577401, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -0.419921875, + "logits/rejected": 1.2890625, + "logps/chosen": -644.0, + "logps/rejected": -572.0, + "loss": 2.18505859375, + "memory(GiB)": 17.87, + "nll_loss": 1.4921875, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.1103515625, + "rewards/margins": 0.08984375, + "rewards/rejected": 0.0201416015625, + "step": 10, + "train_speed(iter/s)": 0.313547 + }, + { + "epoch": 0.3, + "grad_norm": 6.325430646213694, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -0.7578125, + "logits/rejected": 1.8203125, + "logps/chosen": -924.0, + "logps/rejected": -1552.0, + "loss": 1.76845703125, + "memory(GiB)": 32.02, + 
"nll_loss": 1.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.75, + "rewards/margins": 2.390625, + "rewards/rejected": 0.359375, + "step": 15, + "train_speed(iter/s)": 0.318522 + }, + { + "epoch": 0.4, + "grad_norm": 2.351010309283972, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": 0.5859375, + "logits/rejected": -0.4375, + "logps/chosen": -432.0, + "logps/rejected": -324.0, + "loss": 1.172265625, + "memory(GiB)": 32.02, + "nll_loss": 0.6640625, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 4.65625, + "rewards/margins": 1.5546875, + "rewards/rejected": 3.09375, + "step": 20, + "train_speed(iter/s)": 0.331194 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.3125, + "eval_logits/rejected": 0.86328125, + "eval_logps/chosen": -14.3125, + "eval_logps/rejected": -182.0, + "eval_loss": 0.6728515625, + "eval_nll_loss": 0.62109375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.375, + "eval_rewards/margins": 3.5625, + "eval_rewards/rejected": 5.8125, + "eval_runtime": 1.2862, + "eval_samples_per_second": 3.11, + "eval_steps_per_second": 0.778, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.5984307422346672, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": 0.255859375, + "logits/rejected": -0.0390625, + "logps/chosen": -326.0, + "logps/rejected": -420.0, + "loss": 0.8047607421875, + "memory(GiB)": 32.02, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.125, + "rewards/margins": 5.40625, + "rewards/rejected": 3.6875, + "step": 25, + "train_speed(iter/s)": 0.330795 + }, + { + "epoch": 0.6, + "grad_norm": 0.5223734060204478, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -0.21875, + "logits/rejected": 1.546875, + "logps/chosen": -400.0, + "logps/rejected": -656.0, + "loss": 0.6901611328125, + "memory(GiB)": 32.02, + "nll_loss": 0.78125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.5625, + "rewards/margins": 6.78125, + "rewards/rejected": 3.765625, + 
"step": 30, + "train_speed(iter/s)": 0.339407 + }, + { + "epoch": 0.7, + "grad_norm": 1.0839186250481967, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": 1.5625, + "logits/rejected": -1.578125, + "logps/chosen": -592.0, + "logps/rejected": -322.0, + "loss": 0.7055419921875, + "memory(GiB)": 32.02, + "nll_loss": 0.96875, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.75, + "rewards/margins": 9.625, + "rewards/rejected": 3.125, + "step": 35, + "train_speed(iter/s)": 0.346622 + }, + { + "epoch": 0.8, + "grad_norm": 13.025539300060226, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": 0.08740234375, + "logits/rejected": 0.71484375, + "logps/chosen": -225.0, + "logps/rejected": -322.0, + "loss": 0.6363037109375, + "memory(GiB)": 32.02, + "nll_loss": 0.56640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.0625, + "rewards/margins": 7.53125, + "rewards/rejected": 3.546875, + "step": 40, + "train_speed(iter/s)": 0.345794 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -0.8046875, + "eval_logits/rejected": 0.9375, + "eval_logps/chosen": -7.1875, + "eval_logps/rejected": -212.0, + "eval_loss": 0.5283203125, + "eval_nll_loss": 0.3125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 7.3125, + "eval_rewards/rejected": 2.796875, + "eval_runtime": 1.2905, + "eval_samples_per_second": 3.1, + "eval_steps_per_second": 0.775, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.49724108967226127, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": 2.890625, + "logits/rejected": 0.002349853515625, + "logps/chosen": -426.0, + "logps/rejected": -430.0, + "loss": 0.5437744140625, + "memory(GiB)": 32.02, + "nll_loss": 0.470703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.0625, + "rewards/margins": 10.6875, + "rewards/rejected": 3.390625, + "step": 45, + "train_speed(iter/s)": 0.342553 + }, + { + "epoch": 1.0, + "grad_norm": 1.4155859018634052, + "learning_rate": 9.410582299213573e-05, + 
"logits/chosen": 0.2451171875, + "logits/rejected": 1.75, + "logps/chosen": -804.0, + "logps/rejected": -1480.0, + "loss": 0.57803955078125, + "memory(GiB)": 32.02, + "nll_loss": 0.6953125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.25, + "rewards/margins": 9.125, + "rewards/rejected": 7.125, + "step": 50, + "train_speed(iter/s)": 0.346029 + }, + { + "epoch": 1.1, + "grad_norm": 0.5694275670231188, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": 1.8359375, + "logits/rejected": 0.703125, + "logps/chosen": -386.0, + "logps/rejected": -612.0, + "loss": 0.6138671875, + "memory(GiB)": 43.66, + "nll_loss": 0.546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.75, + "rewards/margins": 12.25, + "rewards/rejected": 3.53125, + "step": 55, + "train_speed(iter/s)": 0.347089 + }, + { + "epoch": 1.2, + "grad_norm": 0.3472486708219598, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": 1.65625, + "logits/rejected": 2.15625, + "logps/chosen": -233.0, + "logps/rejected": -520.0, + "loss": 0.5795654296875, + "memory(GiB)": 43.66, + "nll_loss": 0.396484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.375, + "rewards/margins": 12.1875, + "rewards/rejected": 3.140625, + "step": 60, + "train_speed(iter/s)": 0.349928 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -0.23828125, + "eval_logits/rejected": 1.703125, + "eval_logps/chosen": -7.3125, + "eval_logps/rejected": -223.0, + "eval_loss": 0.51806640625, + "eval_nll_loss": 0.31640625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 8.4375, + "eval_rewards/rejected": 1.703125, + "eval_runtime": 1.2918, + "eval_samples_per_second": 3.096, + "eval_steps_per_second": 0.774, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.48830819519223906, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": 0.97265625, + "logits/rejected": 1.9921875, + "logps/chosen": -346.0, + "logps/rejected": -844.0, + "loss": 0.6193359375, + "memory(GiB)": 
43.66, + "nll_loss": 0.640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5625, + "rewards/margins": 11.875, + "rewards/rejected": 3.703125, + "step": 65, + "train_speed(iter/s)": 0.34619 + }, + { + "epoch": 1.4, + "grad_norm": 0.3461001279478362, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -0.6875, + "logits/rejected": 2.28125, + "logps/chosen": -88.0, + "logps/rejected": -1120.0, + "loss": 0.5681396484375, + "memory(GiB)": 43.66, + "nll_loss": 0.84375, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 11.5625, + "rewards/margins": 9.125, + "rewards/rejected": 2.484375, + "step": 70, + "train_speed(iter/s)": 0.349389 + }, + { + "epoch": 1.5, + "grad_norm": 0.49738864814975586, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": 2.515625, + "logits/rejected": 0.263671875, + "logps/chosen": -462.0, + "logps/rejected": -434.0, + "loss": 0.517205810546875, + "memory(GiB)": 43.66, + "nll_loss": 0.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.625, + "rewards/margins": 16.375, + "rewards/rejected": 2.28125, + "step": 75, + "train_speed(iter/s)": 0.351345 + }, + { + "epoch": 1.6, + "grad_norm": 0.3498527388541882, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": 3.5625, + "logits/rejected": -1.296875, + "logps/chosen": -580.0, + "logps/rejected": -132.0, + "loss": 0.5629638671875, + "memory(GiB)": 43.66, + "nll_loss": 0.703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.5, + "rewards/margins": 15.6875, + "rewards/rejected": 5.84375, + "step": 80, + "train_speed(iter/s)": 0.353922 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -0.68359375, + "eval_logits/rejected": 0.3125, + "eval_logps/chosen": -6.3125, + "eval_logps/rejected": -306.0, + "eval_loss": 0.5009765625, + "eval_nll_loss": 0.2734375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.1875, + "eval_rewards/margins": 16.75, + "eval_rewards/rejected": -6.59375, + "eval_runtime": 1.3254, + "eval_samples_per_second": 3.018, 
+ "eval_steps_per_second": 0.754, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.6649877164961882, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": 0.72265625, + "logits/rejected": 0.388671875, + "logps/chosen": -326.0, + "logps/rejected": -768.0, + "loss": 0.70977783203125, + "memory(GiB)": 43.66, + "nll_loss": 0.458984375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5, + "rewards/margins": 18.375, + "rewards/rejected": -2.859375, + "step": 85, + "train_speed(iter/s)": 0.351477 + }, + { + "epoch": 1.8, + "grad_norm": 0.5774391096810936, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": 2.15625, + "logits/rejected": 1.515625, + "logps/chosen": -290.0, + "logps/rejected": -520.0, + "loss": 0.5434326171875, + "memory(GiB)": 43.66, + "nll_loss": 0.56640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.75, + "rewards/margins": 14.25, + "rewards/rejected": 0.5390625, + "step": 90, + "train_speed(iter/s)": 0.351018 + }, + { + "epoch": 1.9, + "grad_norm": 0.46335079012099145, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": 1.65625, + "logits/rejected": 1.1640625, + "logps/chosen": -366.0, + "logps/rejected": -620.0, + "loss": 0.60682373046875, + "memory(GiB)": 43.66, + "nll_loss": 0.5, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.75, + "rewards/margins": 15.9375, + "rewards/rejected": 0.7890625, + "step": 95, + "train_speed(iter/s)": 0.351082 + }, + { + "epoch": 2.0, + "grad_norm": 0.44055183589508673, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": 0.59765625, + "logits/rejected": 0.1025390625, + "logps/chosen": -276.0, + "logps/rejected": -360.0, + "loss": 0.51856689453125, + "memory(GiB)": 43.66, + "nll_loss": 0.470703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.6875, + "rewards/margins": 14.0, + "rewards/rejected": 1.65625, + "step": 100, + "train_speed(iter/s)": 0.350869 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -0.63671875, + "eval_logits/rejected": 0.66796875, + 
"eval_logps/chosen": -6.84375, + "eval_logps/rejected": -280.0, + "eval_loss": 0.50390625, + "eval_nll_loss": 0.298828125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 14.125, + "eval_rewards/rejected": -4.0, + "eval_runtime": 1.3254, + "eval_samples_per_second": 3.018, + "eval_steps_per_second": 0.754, + "step": 100 + }, + { + "epoch": 2.1, + "grad_norm": 0.43319830084665456, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": 3.5, + "logits/rejected": 0.185546875, + "logps/chosen": -442.0, + "logps/rejected": -580.0, + "loss": 0.540869140625, + "memory(GiB)": 43.66, + "nll_loss": 0.60546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.375, + "rewards/margins": 16.125, + "rewards/rejected": 4.25, + "step": 105, + "train_speed(iter/s)": 0.34793 + }, + { + "epoch": 2.2, + "grad_norm": 0.5087186205209226, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": 0.41015625, + "logits/rejected": 1.9375, + "logps/chosen": -688.0, + "logps/rejected": -1280.0, + "loss": 0.5629150390625, + "memory(GiB)": 43.66, + "nll_loss": 0.69140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.875, + "rewards/margins": 14.5, + "rewards/rejected": 6.34375, + "step": 110, + "train_speed(iter/s)": 0.34833 + }, + { + "epoch": 2.3, + "grad_norm": 0.30352973245628134, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": 1.546875, + "logits/rejected": 1.0625, + "logps/chosen": -568.0, + "logps/rejected": -704.0, + "loss": 0.596630859375, + "memory(GiB)": 43.66, + "nll_loss": 0.5390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.0, + "rewards/margins": 15.4375, + "rewards/rejected": 5.5, + "step": 115, + "train_speed(iter/s)": 0.349547 + }, + { + "epoch": 2.4, + "grad_norm": 0.5099680534446042, + "learning_rate": 5.7592521739125726e-05, + "logits/chosen": 2.40625, + "logits/rejected": 0.002349853515625, + "logps/chosen": -368.0, + "logps/rejected": -316.0, + "loss": 0.463055419921875, + "memory(GiB)": 
43.66, + "nll_loss": 0.37109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.625, + "rewards/margins": 13.1875, + "rewards/rejected": 3.484375, + "step": 120, + "train_speed(iter/s)": 0.35126 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -0.96484375, + "eval_logits/rejected": 0.416015625, + "eval_logps/chosen": -6.4375, + "eval_logps/rejected": -280.0, + "eval_loss": 0.49755859375, + "eval_nll_loss": 0.279296875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.1875, + "eval_rewards/margins": 14.1875, + "eval_rewards/rejected": -4.0, + "eval_runtime": 1.3007, + "eval_samples_per_second": 3.075, + "eval_steps_per_second": 0.769, + "step": 120 + }, + { + "epoch": 2.5, + "grad_norm": 0.6759212291558464, + "learning_rate": 5.430276272567485e-05, + "logits/chosen": -1.4140625, + "logits/rejected": 0.82421875, + "logps/chosen": -62.5, + "logps/rejected": -1288.0, + "loss": 0.5360504150390625, + "memory(GiB)": 43.66, + "nll_loss": 0.7265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.75, + "rewards/margins": 19.75, + "rewards/rejected": -7.96875, + "step": 125, + "train_speed(iter/s)": 0.350267 + }, + { + "epoch": 2.6, + "grad_norm": 0.3635118516286732, + "learning_rate": 5.0994109383253506e-05, + "logits/chosen": 2.3125, + "logits/rejected": 0.546875, + "logps/chosen": -348.0, + "logps/rejected": -314.0, + "loss": 0.4655029296875, + "memory(GiB)": 43.66, + "nll_loss": 0.484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.875, + "rewards/margins": 15.3125, + "rewards/rejected": 3.515625, + "step": 130, + "train_speed(iter/s)": 0.35164 + }, + { + "epoch": 2.7, + "grad_norm": 0.4546423377677983, + "learning_rate": 4.768109069909307e-05, + "logits/chosen": -0.375, + "logits/rejected": 0.88671875, + "logps/chosen": -157.0, + "logps/rejected": -624.0, + "loss": 0.44596214294433595, + "memory(GiB)": 43.66, + "nll_loss": 0.24609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.375, + "rewards/margins": 14.9375, + 
"rewards/rejected": -1.5625, + "step": 135, + "train_speed(iter/s)": 0.352504 + }, + { + "epoch": 2.8, + "grad_norm": 0.3766526906078867, + "learning_rate": 4.4378254829551396e-05, + "logits/chosen": 1.484375, + "logits/rejected": 1.1484375, + "logps/chosen": -392.0, + "logps/rejected": -708.0, + "loss": 0.4998992919921875, + "memory(GiB)": 43.66, + "nll_loss": 0.734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.625, + "rewards/margins": 17.0, + "rewards/rejected": 1.5546875, + "step": 140, + "train_speed(iter/s)": 0.353731 + }, + { + "epoch": 2.8, + "eval_logits/chosen": -1.0546875, + "eval_logits/rejected": 0.28515625, + "eval_logps/chosen": -6.65625, + "eval_logps/rejected": -282.0, + "eval_loss": 0.49853515625, + "eval_nll_loss": 0.2890625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.1875, + "eval_rewards/margins": 14.375, + "eval_rewards/rejected": -4.1875, + "eval_runtime": 1.2976, + "eval_samples_per_second": 3.083, + "eval_steps_per_second": 0.771, + "step": 140 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 121555446530048.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/training_args.bin b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..475462e40417630252a03e5a9352b5e11f000ed1 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 
+oid sha256:647dcffabf1a4c913f292b185978f2f179c4819a3df809536d284c803d9d97fe +size 8888 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/zero_to_fp32.py b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-140/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/README.md b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/README.md new file mode 100644 index 0000000000000000000000000000000000000000..525e0a2a5243ae77491e227cbf74fa8ff0a43579 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-7b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both 
direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/adapter_config.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..454fa4ee7fc180fe11671ee7f6c90d0ba44328a6 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + 
"rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "gate_proj", + "up_proj", + "o_proj", + "v_proj", + "down_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/adapter_model.safetensors b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c058ae4f74d544d882ddfead18b8f894f6cb5a08 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b465f0f0e267b66ab1e45cfcffc7f6ed8da550ec8a0673cf7cf188b7f6d55b4 +size 40422208 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/additional_config.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/args.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/args.json new file mode 100644 index 0000000000000000000000000000000000000000..4b016b97ef2d4589dae9d4623f25925cb829d241 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/args.json @@ -0,0 +1,371 @@ +{ + "model": 
"/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + 
"do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + 
"fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + 
"push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + 
"galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-7b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-7b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, 
task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, 
per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, 
use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, 
neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9db5527140c15aed362df28ee4f1d3ee8a81db1b --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7cbbbf82c6bd386962b5ac785c72034a6ce8c878a8cab42ca3e453b4379477b +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c0d5eaee90926f21fec3bc27aece55e22ad180b --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2313d440367e21fb641be0ff6450fcb382254424903567618a92b3f800cf2a6d +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eea631e74573a58e1e0f7613776f9fd5fd6c20be --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd482588047ead55780e18c743c436dc9fbc0366548738c132cb4d25fcf50f38 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..191815c38674bb4a7788deb98ea48cc9511c4daf --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:6aa7e1ba234a6b2ab323fa2c84c46762a981d513ce1415501075ffe8867ec7db +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..97e162d3a62f2df408aeca693365868c276756a5 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:226a11b07dd82394039e2aa4b8cc49dfca500e0129888eb7e5467d59e486e45e +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..258032f61e9e639ceb8794fb56b485209e163926 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9aa96c3afbbadb4662355528d320a93a57ceee565efca935a83f6dbc6812bdc2 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..408c5c55ac1d9744f2e640e0e015153af3c4c250 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7c9d0f0374b4f350190020c72c7ae4515639a5123d708dec484d6f517e35b4e +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..57a2d4353ee235b4edbce07c763bdc9081fe6bc5 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d449c6df4b9a4a3f24dde2d5f090875c80bd2155d87e6fb9fb604e0f8175d618 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/zero_pp_rank_0_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b606780b83222d4c09d3f92cbae67c69063e95b3 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d72131ee23d6ae489dac79a1416e027163415346b187b38bca78ab1610918d4e +size 388374 diff --git 
a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/zero_pp_rank_1_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0928cfe3c6855a198b66ef54628ccf8bbf687210 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9aeca38a91e80cc275b7109bc66907b4b8b9500b97accb4ac3a2b9e920215c71 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/zero_pp_rank_2_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..709f0a6c5d992c6dfbbda825541355734cdb77cf --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcb5646acc6c5db1f5ca04cb4beed6b2760cddbd9e6822d5ba70a536cb659d4e +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/zero_pp_rank_3_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f7d8c5877a3bfeb5bf55584d4b70364c1f7dd905 --- /dev/null +++ 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f9fec8388828c8f3392bf610ff2e5565140958dc100a3c7586170fa08e3e94d +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/zero_pp_rank_4_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0125c4a5effd3c4aca38fefca109a5acb571600d --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad0813d53bcf06ab8af66ce4f1b2c352c3b004c98d45e0d7091edc000e26bbc1 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/zero_pp_rank_5_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5730fb817c1fcc9363b4c7c138a82e28bd27e0c --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0aa05538520a0bb9708e6756f50d65d63430c0a07f2b185208ddbaca9fea9ada +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/zero_pp_rank_6_mp_rank_00_model_states.pt 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cc24631e5b676a939526d0ded7df7922e6ab7de7 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e2372b6df9bbe89f2c83f549a88fbfc15e4137bfd0d35681e3f1decf619ae75 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/zero_pp_rank_7_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cebc3af76929714291a3638d09966f69b658c73a --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/global_step160/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f765ad06c0f49b57b838ff7a5fb7908ebff78f43ed434dbd42851caf65e33b87 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/latest b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/latest new file mode 100644 index 0000000000000000000000000000000000000000..3df30ded267d950ff3ca04cffb9660be12079ca6 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/latest @@ -0,0 +1 @@ +global_step160 \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/rng_state_0.pth 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..e31a2394e12bf431ae13288c3d90fe4727f07fa7 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:feb6462d333dbc5bb5e497ea9b0adb960f7616f79e6eea63222de6d5bd559516 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/rng_state_1.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d1db0a0f44aa3ac1d82c3bf8dc2d8968eeba4ce7 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b045e1bfa728f51c8b51ab0faa20b128a4fbd350da006b9b39a19e24abdf5a74 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/rng_state_2.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..75de18f57a056bd6a5f89df1abd045678f3f919e --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f76a3d058d2628a61848c2441d313f251278bd8f74ce43dc44d8cd8ad3e619a8 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/rng_state_3.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/rng_state_3.pth new file mode 100644 index 
0000000000000000000000000000000000000000..2fd100693bc9f3267d044ce4a16e702502dc03ec --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7f72fc498e6eaa671cdc0e8a627a668b8ef607063a22ddb4edbc05e791be830 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/rng_state_4.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..5aeeabfe119f1cb0c8c804f1b9a4d3049f478d69 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12889af98e175b734a788f4c5b8c4da91dd61ff3a05aaf61b9d4c66aa3dd8ad6 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/rng_state_5.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..91fe0f42382ab06f4d26d753745a914c9e46100e --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe21a86abfceeac2cf2f48afd61a9a506cf61a287f3403f1adf391bb2ffa5a83 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/rng_state_6.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..5830ca6bd04645962b6e56a00a91cd8349ca449c --- /dev/null +++ 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73488bec91f9dee6d8105d06f99edaf4d27b6b064250d4c7023f33285b2f3132 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/rng_state_7.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..343d1c0475f0dc64100dc67b09195e047f1a7bcf --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edf6ee1cc2e1325b428a21172ec4e61b7220c5489751ea11c06bb66c77a0cd08 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/scheduler.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..229789af83e72e748f236450e9d2df977318d98a --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b659f5e1f39ab526587d47a9d305eeca96cdb1335d25ff0a7b9958f685604b4 +size 1064 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/trainer_state.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c2a60189b19ba8ddeca984639ea60c940fe5f59d --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/trainer_state.json @@ -0,0 +1,763 @@ +{ + "best_metric": 0.49707031, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160", + "epoch": 3.2, + "eval_steps": 20, + "global_step": 160, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 9.15866064284304, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": 2.390625, + "logits/rejected": -2.546875, + "logps/chosen": -520.0, + "logps/rejected": -142.0, + "loss": 2.4697265625, + "memory(GiB)": 8.4, + "nll_loss": 0.51171875, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.126463 + }, + { + "epoch": 0.1, + "grad_norm": 26.585759281524314, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": 0.1533203125, + "logits/rejected": -0.8671875, + "logps/chosen": -728.0, + "logps/rejected": -304.0, + "loss": 2.3878173828125, + "memory(GiB)": 12.04, + "nll_loss": 1.1484375, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.125, + "rewards/margins": -0.0250244140625, + "rewards/rejected": 0.150390625, + "step": 5, + "train_speed(iter/s)": 0.276117 + }, + { + "epoch": 0.2, + "grad_norm": 8.664467296577401, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -0.419921875, + "logits/rejected": 1.2890625, + "logps/chosen": -644.0, + "logps/rejected": -572.0, + "loss": 2.18505859375, + "memory(GiB)": 17.87, + "nll_loss": 1.4921875, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.1103515625, + "rewards/margins": 0.08984375, + "rewards/rejected": 0.0201416015625, + "step": 10, + "train_speed(iter/s)": 0.313547 + }, + { + "epoch": 0.3, + "grad_norm": 6.325430646213694, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -0.7578125, + "logits/rejected": 1.8203125, + "logps/chosen": -924.0, + "logps/rejected": -1552.0, + "loss": 1.76845703125, + "memory(GiB)": 32.02, + 
"nll_loss": 1.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.75, + "rewards/margins": 2.390625, + "rewards/rejected": 0.359375, + "step": 15, + "train_speed(iter/s)": 0.318522 + }, + { + "epoch": 0.4, + "grad_norm": 2.351010309283972, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": 0.5859375, + "logits/rejected": -0.4375, + "logps/chosen": -432.0, + "logps/rejected": -324.0, + "loss": 1.172265625, + "memory(GiB)": 32.02, + "nll_loss": 0.6640625, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 4.65625, + "rewards/margins": 1.5546875, + "rewards/rejected": 3.09375, + "step": 20, + "train_speed(iter/s)": 0.331194 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.3125, + "eval_logits/rejected": 0.86328125, + "eval_logps/chosen": -14.3125, + "eval_logps/rejected": -182.0, + "eval_loss": 0.6728515625, + "eval_nll_loss": 0.62109375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.375, + "eval_rewards/margins": 3.5625, + "eval_rewards/rejected": 5.8125, + "eval_runtime": 1.2862, + "eval_samples_per_second": 3.11, + "eval_steps_per_second": 0.778, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.5984307422346672, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": 0.255859375, + "logits/rejected": -0.0390625, + "logps/chosen": -326.0, + "logps/rejected": -420.0, + "loss": 0.8047607421875, + "memory(GiB)": 32.02, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.125, + "rewards/margins": 5.40625, + "rewards/rejected": 3.6875, + "step": 25, + "train_speed(iter/s)": 0.330795 + }, + { + "epoch": 0.6, + "grad_norm": 0.5223734060204478, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -0.21875, + "logits/rejected": 1.546875, + "logps/chosen": -400.0, + "logps/rejected": -656.0, + "loss": 0.6901611328125, + "memory(GiB)": 32.02, + "nll_loss": 0.78125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.5625, + "rewards/margins": 6.78125, + "rewards/rejected": 3.765625, + 
"step": 30, + "train_speed(iter/s)": 0.339407 + }, + { + "epoch": 0.7, + "grad_norm": 1.0839186250481967, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": 1.5625, + "logits/rejected": -1.578125, + "logps/chosen": -592.0, + "logps/rejected": -322.0, + "loss": 0.7055419921875, + "memory(GiB)": 32.02, + "nll_loss": 0.96875, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.75, + "rewards/margins": 9.625, + "rewards/rejected": 3.125, + "step": 35, + "train_speed(iter/s)": 0.346622 + }, + { + "epoch": 0.8, + "grad_norm": 13.025539300060226, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": 0.08740234375, + "logits/rejected": 0.71484375, + "logps/chosen": -225.0, + "logps/rejected": -322.0, + "loss": 0.6363037109375, + "memory(GiB)": 32.02, + "nll_loss": 0.56640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.0625, + "rewards/margins": 7.53125, + "rewards/rejected": 3.546875, + "step": 40, + "train_speed(iter/s)": 0.345794 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -0.8046875, + "eval_logits/rejected": 0.9375, + "eval_logps/chosen": -7.1875, + "eval_logps/rejected": -212.0, + "eval_loss": 0.5283203125, + "eval_nll_loss": 0.3125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 7.3125, + "eval_rewards/rejected": 2.796875, + "eval_runtime": 1.2905, + "eval_samples_per_second": 3.1, + "eval_steps_per_second": 0.775, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.49724108967226127, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": 2.890625, + "logits/rejected": 0.002349853515625, + "logps/chosen": -426.0, + "logps/rejected": -430.0, + "loss": 0.5437744140625, + "memory(GiB)": 32.02, + "nll_loss": 0.470703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.0625, + "rewards/margins": 10.6875, + "rewards/rejected": 3.390625, + "step": 45, + "train_speed(iter/s)": 0.342553 + }, + { + "epoch": 1.0, + "grad_norm": 1.4155859018634052, + "learning_rate": 9.410582299213573e-05, + 
"logits/chosen": 0.2451171875, + "logits/rejected": 1.75, + "logps/chosen": -804.0, + "logps/rejected": -1480.0, + "loss": 0.57803955078125, + "memory(GiB)": 32.02, + "nll_loss": 0.6953125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.25, + "rewards/margins": 9.125, + "rewards/rejected": 7.125, + "step": 50, + "train_speed(iter/s)": 0.346029 + }, + { + "epoch": 1.1, + "grad_norm": 0.5694275670231188, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": 1.8359375, + "logits/rejected": 0.703125, + "logps/chosen": -386.0, + "logps/rejected": -612.0, + "loss": 0.6138671875, + "memory(GiB)": 43.66, + "nll_loss": 0.546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.75, + "rewards/margins": 12.25, + "rewards/rejected": 3.53125, + "step": 55, + "train_speed(iter/s)": 0.347089 + }, + { + "epoch": 1.2, + "grad_norm": 0.3472486708219598, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": 1.65625, + "logits/rejected": 2.15625, + "logps/chosen": -233.0, + "logps/rejected": -520.0, + "loss": 0.5795654296875, + "memory(GiB)": 43.66, + "nll_loss": 0.396484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.375, + "rewards/margins": 12.1875, + "rewards/rejected": 3.140625, + "step": 60, + "train_speed(iter/s)": 0.349928 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -0.23828125, + "eval_logits/rejected": 1.703125, + "eval_logps/chosen": -7.3125, + "eval_logps/rejected": -223.0, + "eval_loss": 0.51806640625, + "eval_nll_loss": 0.31640625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 8.4375, + "eval_rewards/rejected": 1.703125, + "eval_runtime": 1.2918, + "eval_samples_per_second": 3.096, + "eval_steps_per_second": 0.774, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.48830819519223906, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": 0.97265625, + "logits/rejected": 1.9921875, + "logps/chosen": -346.0, + "logps/rejected": -844.0, + "loss": 0.6193359375, + "memory(GiB)": 
43.66, + "nll_loss": 0.640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5625, + "rewards/margins": 11.875, + "rewards/rejected": 3.703125, + "step": 65, + "train_speed(iter/s)": 0.34619 + }, + { + "epoch": 1.4, + "grad_norm": 0.3461001279478362, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -0.6875, + "logits/rejected": 2.28125, + "logps/chosen": -88.0, + "logps/rejected": -1120.0, + "loss": 0.5681396484375, + "memory(GiB)": 43.66, + "nll_loss": 0.84375, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 11.5625, + "rewards/margins": 9.125, + "rewards/rejected": 2.484375, + "step": 70, + "train_speed(iter/s)": 0.349389 + }, + { + "epoch": 1.5, + "grad_norm": 0.49738864814975586, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": 2.515625, + "logits/rejected": 0.263671875, + "logps/chosen": -462.0, + "logps/rejected": -434.0, + "loss": 0.517205810546875, + "memory(GiB)": 43.66, + "nll_loss": 0.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.625, + "rewards/margins": 16.375, + "rewards/rejected": 2.28125, + "step": 75, + "train_speed(iter/s)": 0.351345 + }, + { + "epoch": 1.6, + "grad_norm": 0.3498527388541882, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": 3.5625, + "logits/rejected": -1.296875, + "logps/chosen": -580.0, + "logps/rejected": -132.0, + "loss": 0.5629638671875, + "memory(GiB)": 43.66, + "nll_loss": 0.703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.5, + "rewards/margins": 15.6875, + "rewards/rejected": 5.84375, + "step": 80, + "train_speed(iter/s)": 0.353922 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -0.68359375, + "eval_logits/rejected": 0.3125, + "eval_logps/chosen": -6.3125, + "eval_logps/rejected": -306.0, + "eval_loss": 0.5009765625, + "eval_nll_loss": 0.2734375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.1875, + "eval_rewards/margins": 16.75, + "eval_rewards/rejected": -6.59375, + "eval_runtime": 1.3254, + "eval_samples_per_second": 3.018, 
+ "eval_steps_per_second": 0.754, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.6649877164961882, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": 0.72265625, + "logits/rejected": 0.388671875, + "logps/chosen": -326.0, + "logps/rejected": -768.0, + "loss": 0.70977783203125, + "memory(GiB)": 43.66, + "nll_loss": 0.458984375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5, + "rewards/margins": 18.375, + "rewards/rejected": -2.859375, + "step": 85, + "train_speed(iter/s)": 0.351477 + }, + { + "epoch": 1.8, + "grad_norm": 0.5774391096810936, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": 2.15625, + "logits/rejected": 1.515625, + "logps/chosen": -290.0, + "logps/rejected": -520.0, + "loss": 0.5434326171875, + "memory(GiB)": 43.66, + "nll_loss": 0.56640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.75, + "rewards/margins": 14.25, + "rewards/rejected": 0.5390625, + "step": 90, + "train_speed(iter/s)": 0.351018 + }, + { + "epoch": 1.9, + "grad_norm": 0.46335079012099145, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": 1.65625, + "logits/rejected": 1.1640625, + "logps/chosen": -366.0, + "logps/rejected": -620.0, + "loss": 0.60682373046875, + "memory(GiB)": 43.66, + "nll_loss": 0.5, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.75, + "rewards/margins": 15.9375, + "rewards/rejected": 0.7890625, + "step": 95, + "train_speed(iter/s)": 0.351082 + }, + { + "epoch": 2.0, + "grad_norm": 0.44055183589508673, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": 0.59765625, + "logits/rejected": 0.1025390625, + "logps/chosen": -276.0, + "logps/rejected": -360.0, + "loss": 0.51856689453125, + "memory(GiB)": 43.66, + "nll_loss": 0.470703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.6875, + "rewards/margins": 14.0, + "rewards/rejected": 1.65625, + "step": 100, + "train_speed(iter/s)": 0.350869 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -0.63671875, + "eval_logits/rejected": 0.66796875, + 
"eval_logps/chosen": -6.84375, + "eval_logps/rejected": -280.0, + "eval_loss": 0.50390625, + "eval_nll_loss": 0.298828125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 14.125, + "eval_rewards/rejected": -4.0, + "eval_runtime": 1.3254, + "eval_samples_per_second": 3.018, + "eval_steps_per_second": 0.754, + "step": 100 + }, + { + "epoch": 2.1, + "grad_norm": 0.43319830084665456, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": 3.5, + "logits/rejected": 0.185546875, + "logps/chosen": -442.0, + "logps/rejected": -580.0, + "loss": 0.540869140625, + "memory(GiB)": 43.66, + "nll_loss": 0.60546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.375, + "rewards/margins": 16.125, + "rewards/rejected": 4.25, + "step": 105, + "train_speed(iter/s)": 0.34793 + }, + { + "epoch": 2.2, + "grad_norm": 0.5087186205209226, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": 0.41015625, + "logits/rejected": 1.9375, + "logps/chosen": -688.0, + "logps/rejected": -1280.0, + "loss": 0.5629150390625, + "memory(GiB)": 43.66, + "nll_loss": 0.69140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.875, + "rewards/margins": 14.5, + "rewards/rejected": 6.34375, + "step": 110, + "train_speed(iter/s)": 0.34833 + }, + { + "epoch": 2.3, + "grad_norm": 0.30352973245628134, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": 1.546875, + "logits/rejected": 1.0625, + "logps/chosen": -568.0, + "logps/rejected": -704.0, + "loss": 0.596630859375, + "memory(GiB)": 43.66, + "nll_loss": 0.5390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.0, + "rewards/margins": 15.4375, + "rewards/rejected": 5.5, + "step": 115, + "train_speed(iter/s)": 0.349547 + }, + { + "epoch": 2.4, + "grad_norm": 0.5099680534446042, + "learning_rate": 5.7592521739125726e-05, + "logits/chosen": 2.40625, + "logits/rejected": 0.002349853515625, + "logps/chosen": -368.0, + "logps/rejected": -316.0, + "loss": 0.463055419921875, + "memory(GiB)": 
43.66, + "nll_loss": 0.37109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.625, + "rewards/margins": 13.1875, + "rewards/rejected": 3.484375, + "step": 120, + "train_speed(iter/s)": 0.35126 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -0.96484375, + "eval_logits/rejected": 0.416015625, + "eval_logps/chosen": -6.4375, + "eval_logps/rejected": -280.0, + "eval_loss": 0.49755859375, + "eval_nll_loss": 0.279296875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.1875, + "eval_rewards/margins": 14.1875, + "eval_rewards/rejected": -4.0, + "eval_runtime": 1.3007, + "eval_samples_per_second": 3.075, + "eval_steps_per_second": 0.769, + "step": 120 + }, + { + "epoch": 2.5, + "grad_norm": 0.6759212291558464, + "learning_rate": 5.430276272567485e-05, + "logits/chosen": -1.4140625, + "logits/rejected": 0.82421875, + "logps/chosen": -62.5, + "logps/rejected": -1288.0, + "loss": 0.5360504150390625, + "memory(GiB)": 43.66, + "nll_loss": 0.7265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.75, + "rewards/margins": 19.75, + "rewards/rejected": -7.96875, + "step": 125, + "train_speed(iter/s)": 0.350267 + }, + { + "epoch": 2.6, + "grad_norm": 0.3635118516286732, + "learning_rate": 5.0994109383253506e-05, + "logits/chosen": 2.3125, + "logits/rejected": 0.546875, + "logps/chosen": -348.0, + "logps/rejected": -314.0, + "loss": 0.4655029296875, + "memory(GiB)": 43.66, + "nll_loss": 0.484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.875, + "rewards/margins": 15.3125, + "rewards/rejected": 3.515625, + "step": 130, + "train_speed(iter/s)": 0.35164 + }, + { + "epoch": 2.7, + "grad_norm": 0.4546423377677983, + "learning_rate": 4.768109069909307e-05, + "logits/chosen": -0.375, + "logits/rejected": 0.88671875, + "logps/chosen": -157.0, + "logps/rejected": -624.0, + "loss": 0.44596214294433595, + "memory(GiB)": 43.66, + "nll_loss": 0.24609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.375, + "rewards/margins": 14.9375, + 
"rewards/rejected": -1.5625, + "step": 135, + "train_speed(iter/s)": 0.352504 + }, + { + "epoch": 2.8, + "grad_norm": 0.3766526906078867, + "learning_rate": 4.4378254829551396e-05, + "logits/chosen": 1.484375, + "logits/rejected": 1.1484375, + "logps/chosen": -392.0, + "logps/rejected": -708.0, + "loss": 0.4998992919921875, + "memory(GiB)": 43.66, + "nll_loss": 0.734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.625, + "rewards/margins": 17.0, + "rewards/rejected": 1.5546875, + "step": 140, + "train_speed(iter/s)": 0.353731 + }, + { + "epoch": 2.8, + "eval_logits/chosen": -1.0546875, + "eval_logits/rejected": 0.28515625, + "eval_logps/chosen": -6.65625, + "eval_logps/rejected": -282.0, + "eval_loss": 0.49853515625, + "eval_nll_loss": 0.2890625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.1875, + "eval_rewards/margins": 14.375, + "eval_rewards/rejected": -4.1875, + "eval_runtime": 1.2976, + "eval_samples_per_second": 3.083, + "eval_steps_per_second": 0.771, + "step": 140 + }, + { + "epoch": 2.9, + "grad_norm": 0.7368475635982081, + "learning_rate": 4.11001052161225e-05, + "logits/chosen": 1.484375, + "logits/rejected": 0.7578125, + "logps/chosen": -624.0, + "logps/rejected": -664.0, + "loss": 0.5592437744140625, + "memory(GiB)": 43.66, + "nll_loss": 0.7421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.75, + "rewards/margins": 16.875, + "rewards/rejected": 4.90625, + "step": 145, + "train_speed(iter/s)": 0.350922 + }, + { + "epoch": 3.0, + "grad_norm": 0.526438371292198, + "learning_rate": 3.786103689779861e-05, + "logits/chosen": 2.53125, + "logits/rejected": -0.11962890625, + "logps/chosen": -406.0, + "logps/rejected": -310.0, + "loss": 0.4158843994140625, + "memory(GiB)": 43.66, + "nll_loss": 0.39453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.5, + "rewards/margins": 19.0, + "rewards/rejected": 3.5, + "step": 150, + "train_speed(iter/s)": 0.350374 + }, + { + "epoch": 3.1, + "grad_norm": 0.3832315044394523, + 
"learning_rate": 3.467527329945026e-05, + "logits/chosen": 1.1484375, + "logits/rejected": 1.328125, + "logps/chosen": -600.0, + "logps/rejected": -588.0, + "loss": 0.5336650848388672, + "memory(GiB)": 43.66, + "nll_loss": 0.515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.125, + "rewards/margins": 17.375, + "rewards/rejected": 1.7265625, + "step": 155, + "train_speed(iter/s)": 0.350973 + }, + { + "epoch": 3.2, + "grad_norm": 0.41804293149830996, + "learning_rate": 3.1556803773799614e-05, + "logits/chosen": 2.84375, + "logits/rejected": 0.10693359375, + "logps/chosen": -416.0, + "logps/rejected": -360.0, + "loss": 0.456103515625, + "memory(GiB)": 43.66, + "nll_loss": 0.38671875, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.5, + "rewards/margins": 16.75, + "rewards/rejected": 4.71875, + "step": 160, + "train_speed(iter/s)": 0.350912 + }, + { + "epoch": 3.2, + "eval_logits/chosen": -1.046875, + "eval_logits/rejected": 0.1669921875, + "eval_logps/chosen": -6.46875, + "eval_logps/rejected": -286.0, + "eval_loss": 0.4970703125, + "eval_nll_loss": 0.28125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.1875, + "eval_rewards/margins": 14.75, + "eval_rewards/rejected": -4.59375, + "eval_runtime": 1.2858, + "eval_samples_per_second": 3.111, + "eval_steps_per_second": 0.778, + "step": 160 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 140560176119808.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/training_args.bin 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..475462e40417630252a03e5a9352b5e11f000ed1 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:647dcffabf1a4c913f292b185978f2f179c4819a3df809536d284c803d9d97fe +size 8888 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/zero_to_fp32.py b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/README.md b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/README.md new file mode 100644 index 0000000000000000000000000000000000000000..525e0a2a5243ae77491e227cbf74fa8ff0a43579 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-7b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both 
direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/adapter_config.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..454fa4ee7fc180fe11671ee7f6c90d0ba44328a6 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + 
"rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "gate_proj", + "up_proj", + "o_proj", + "v_proj", + "down_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/adapter_model.safetensors b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..100d4b35a132490189f7f0fb4dd7f3544421801f --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddec4fbe6211ad0c191622398575562fdf7ce9ae85bc502dbcc7fd4b2e24d09b +size 40422208 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/additional_config.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/args.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/args.json new file mode 100644 index 0000000000000000000000000000000000000000..4b016b97ef2d4589dae9d4623f25925cb829d241 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/args.json @@ -0,0 +1,371 @@ +{ + "model": 
"/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + 
"do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + 
"fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + 
"push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + 
"galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-7b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-7b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, 
task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, 
per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, 
use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, 
neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fd49e64212cf48a93d9c01740af3af40e7925a74 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6a8b572cdffa68d5718106d95c2dfe92e0e58d144ab4d885a1bc2a89a8ffa37 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0cfa99c0b62df68825eab2c1259f1271814f30d3 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30d4b71053bb077cc918d7bcb270deebdedf96001ec9500dd09372955696eea1 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9880d9450236d897383f34c322ea0560e95fabe0 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ce25355333b420d6f70f18f0c04b6bd34e41fcd657ffa2c75b80530f02fb617 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..77f0cae352058c3076bdc355195cf157044c5f21 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:550dcb653058194754b80ab22fe25a6b939bff0007bdae3200653677a8fa80e4 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c31364f1e4e85b20ee3b2e3c5a5ad296fd6901fc --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4200eb181912fc1455058c6a664a4e304bb70726e04683c303515b5978e5686b +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bc559b8030e2bece07163545a1aef2af0d35039e --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae0205610377d7159d78ed0f810541d4f5a4d551a4dcb6371afb0c86a37421cc +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..6e3fb26f05c52050ad52b551c2ad4d230d667d09 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a2fe196e534e9d9f832de65c49985548d613967f70ad66d92bec5981634a5ec +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e42285bdb4630f51a3946c0246c5cdd5cbf9438 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bf8d57dd5462aec48551d06beae86a3ee85d2f6d798d0b5c5b391346a104a85 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/zero_pp_rank_0_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f5586628d426b32e5339ec5dc0ea154b8e0fca78 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74e14e3d07a38fafc49b2a696577253142ff5cc04b6f7615ab32e78bae65bb40 +size 388374 diff --git 
a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/zero_pp_rank_1_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3b2284a2c53243af6f173abb24eac2b11f1309d7 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57945dafe4585b590ebe683e4a86f62906e4a01cc0d25f90314f29d2bea1f467 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/zero_pp_rank_2_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..58bc4082284d671b2c750553fcb2d22a144f4f18 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84ed8ba189e9ebf7ba05c51b0757a2b2b5cc59f32d73a252b1d05b510c0b5075 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/zero_pp_rank_3_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..362c19c860bfe596819ff7b1eb755a8e50b029e0 --- /dev/null +++ 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95e2179e50b6a9a8a4b117e3c010ebe303022623b80a91c1edfef908f3c91332 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/zero_pp_rank_4_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2a89327f2f25b8c9b382e9449496afaa8fcbede2 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1e4028828494ef9d09a70550f1f4bc6ec19c4e744ae63d55a6ed858ef643f9e +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/zero_pp_rank_5_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..acedceb2e78a27e31d60f2ad0f95c09dc013a77c --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d85a18dea48a7b2ab6f42da3bfd8068e93465c3403b8df17581b31217758aa7 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/zero_pp_rank_6_mp_rank_00_model_states.pt 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..acee0dfe043c8ddf30d9113187145521062f38a8 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:232b9c6398d31a30571c1df0bcc045279295c239c5e64a6a3c4e5699564a6057 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/zero_pp_rank_7_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..69c61c04f7090dfb92c9a0028c0ebb2305a0fe3b --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/global_step180/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20917583bbbb51c24a7ca7d11e122c8bdd5652edc914f49fa8d809996af186bc +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/latest b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/latest new file mode 100644 index 0000000000000000000000000000000000000000..eac7d625396c2750025575c77b8da5d622b0c7dc --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/latest @@ -0,0 +1 @@ +global_step180 \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/rng_state_0.pth 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..97f51b498d48145bd9cc14b35f8236b9ec95a4f7 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1bec598899f9d59e70c1b4705ce420a1e0a670957b6c8153a589880068ae5a4 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/rng_state_1.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..08e59ac81067b262a084604cd3392250166c2841 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c60d2348aae518f4c44693db9c9b4b3a3299c556e7f0a86c188b2e4c3e364a7c +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/rng_state_2.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..20a24c17b4be2ee59cd5e6682010519318a91e58 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffe5a79d3bcb4ce033de360bc765e616316e3562aba25887cd85c4adbb935abf +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/rng_state_3.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/rng_state_3.pth new file mode 100644 index 
0000000000000000000000000000000000000000..54050f6cf8fb847e2a926e14a7aad2647761521a --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9a9d1f6e22677721841890e6a27855857e6840137650d609eb8e4ac13b71d29 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/rng_state_4.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..263aae475c49b090bce43f143308192c5bf9a95b --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcac4ff84388a6a4fe3bcae6207c68b2ee5528fb3b6de8cc3588fe1975462aa5 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/rng_state_5.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..942ed5d60ae87dce686b33da76a34db404036dc6 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33fce3cdf5c1b8a8a291e0c73b384e3ad5252640e21e942b44b26b8b0928ffa9 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/rng_state_6.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..57789be3df3983cb8acc1500bf6470ffadb1c578 --- /dev/null +++ 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:919e675f3bcaf4f3c8ba35cd8debf85aec3bbc3c8e5019b74431e0a314e4d37a +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/rng_state_7.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..b32d6e2e7eb7148713b473b0c821a98e616ab6e6 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bf6479ce82b88efc6a72a8ee512162b3d0ecab972817296d38ab9c448bb8d96 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/scheduler.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..18942cfbbbc36710e196a20b862a745c9dcc2468 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0fa6cf7ac608af8ab72180ce60dcfa61b0bf4eeab8e185f70f65a95b45e6b7a +size 1064 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/trainer_state.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8f179373516f03f927cfe14ae72b71383e0c786a --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/trainer_state.json @@ -0,0 +1,852 @@ +{ + "best_metric": 0.49707031, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160", + "epoch": 3.6, + "eval_steps": 20, + "global_step": 180, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 9.15866064284304, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": 2.390625, + "logits/rejected": -2.546875, + "logps/chosen": -520.0, + "logps/rejected": -142.0, + "loss": 2.4697265625, + "memory(GiB)": 8.4, + "nll_loss": 0.51171875, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.126463 + }, + { + "epoch": 0.1, + "grad_norm": 26.585759281524314, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": 0.1533203125, + "logits/rejected": -0.8671875, + "logps/chosen": -728.0, + "logps/rejected": -304.0, + "loss": 2.3878173828125, + "memory(GiB)": 12.04, + "nll_loss": 1.1484375, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.125, + "rewards/margins": -0.0250244140625, + "rewards/rejected": 0.150390625, + "step": 5, + "train_speed(iter/s)": 0.276117 + }, + { + "epoch": 0.2, + "grad_norm": 8.664467296577401, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -0.419921875, + "logits/rejected": 1.2890625, + "logps/chosen": -644.0, + "logps/rejected": -572.0, + "loss": 2.18505859375, + "memory(GiB)": 17.87, + "nll_loss": 1.4921875, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.1103515625, + "rewards/margins": 0.08984375, + "rewards/rejected": 0.0201416015625, + "step": 10, + "train_speed(iter/s)": 0.313547 + }, + { + "epoch": 0.3, + "grad_norm": 6.325430646213694, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -0.7578125, + "logits/rejected": 1.8203125, + "logps/chosen": -924.0, + "logps/rejected": -1552.0, + "loss": 1.76845703125, + "memory(GiB)": 32.02, + 
"nll_loss": 1.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.75, + "rewards/margins": 2.390625, + "rewards/rejected": 0.359375, + "step": 15, + "train_speed(iter/s)": 0.318522 + }, + { + "epoch": 0.4, + "grad_norm": 2.351010309283972, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": 0.5859375, + "logits/rejected": -0.4375, + "logps/chosen": -432.0, + "logps/rejected": -324.0, + "loss": 1.172265625, + "memory(GiB)": 32.02, + "nll_loss": 0.6640625, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 4.65625, + "rewards/margins": 1.5546875, + "rewards/rejected": 3.09375, + "step": 20, + "train_speed(iter/s)": 0.331194 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.3125, + "eval_logits/rejected": 0.86328125, + "eval_logps/chosen": -14.3125, + "eval_logps/rejected": -182.0, + "eval_loss": 0.6728515625, + "eval_nll_loss": 0.62109375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.375, + "eval_rewards/margins": 3.5625, + "eval_rewards/rejected": 5.8125, + "eval_runtime": 1.2862, + "eval_samples_per_second": 3.11, + "eval_steps_per_second": 0.778, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.5984307422346672, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": 0.255859375, + "logits/rejected": -0.0390625, + "logps/chosen": -326.0, + "logps/rejected": -420.0, + "loss": 0.8047607421875, + "memory(GiB)": 32.02, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.125, + "rewards/margins": 5.40625, + "rewards/rejected": 3.6875, + "step": 25, + "train_speed(iter/s)": 0.330795 + }, + { + "epoch": 0.6, + "grad_norm": 0.5223734060204478, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -0.21875, + "logits/rejected": 1.546875, + "logps/chosen": -400.0, + "logps/rejected": -656.0, + "loss": 0.6901611328125, + "memory(GiB)": 32.02, + "nll_loss": 0.78125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.5625, + "rewards/margins": 6.78125, + "rewards/rejected": 3.765625, + 
"step": 30, + "train_speed(iter/s)": 0.339407 + }, + { + "epoch": 0.7, + "grad_norm": 1.0839186250481967, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": 1.5625, + "logits/rejected": -1.578125, + "logps/chosen": -592.0, + "logps/rejected": -322.0, + "loss": 0.7055419921875, + "memory(GiB)": 32.02, + "nll_loss": 0.96875, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.75, + "rewards/margins": 9.625, + "rewards/rejected": 3.125, + "step": 35, + "train_speed(iter/s)": 0.346622 + }, + { + "epoch": 0.8, + "grad_norm": 13.025539300060226, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": 0.08740234375, + "logits/rejected": 0.71484375, + "logps/chosen": -225.0, + "logps/rejected": -322.0, + "loss": 0.6363037109375, + "memory(GiB)": 32.02, + "nll_loss": 0.56640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.0625, + "rewards/margins": 7.53125, + "rewards/rejected": 3.546875, + "step": 40, + "train_speed(iter/s)": 0.345794 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -0.8046875, + "eval_logits/rejected": 0.9375, + "eval_logps/chosen": -7.1875, + "eval_logps/rejected": -212.0, + "eval_loss": 0.5283203125, + "eval_nll_loss": 0.3125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 7.3125, + "eval_rewards/rejected": 2.796875, + "eval_runtime": 1.2905, + "eval_samples_per_second": 3.1, + "eval_steps_per_second": 0.775, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.49724108967226127, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": 2.890625, + "logits/rejected": 0.002349853515625, + "logps/chosen": -426.0, + "logps/rejected": -430.0, + "loss": 0.5437744140625, + "memory(GiB)": 32.02, + "nll_loss": 0.470703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.0625, + "rewards/margins": 10.6875, + "rewards/rejected": 3.390625, + "step": 45, + "train_speed(iter/s)": 0.342553 + }, + { + "epoch": 1.0, + "grad_norm": 1.4155859018634052, + "learning_rate": 9.410582299213573e-05, + 
"logits/chosen": 0.2451171875, + "logits/rejected": 1.75, + "logps/chosen": -804.0, + "logps/rejected": -1480.0, + "loss": 0.57803955078125, + "memory(GiB)": 32.02, + "nll_loss": 0.6953125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.25, + "rewards/margins": 9.125, + "rewards/rejected": 7.125, + "step": 50, + "train_speed(iter/s)": 0.346029 + }, + { + "epoch": 1.1, + "grad_norm": 0.5694275670231188, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": 1.8359375, + "logits/rejected": 0.703125, + "logps/chosen": -386.0, + "logps/rejected": -612.0, + "loss": 0.6138671875, + "memory(GiB)": 43.66, + "nll_loss": 0.546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.75, + "rewards/margins": 12.25, + "rewards/rejected": 3.53125, + "step": 55, + "train_speed(iter/s)": 0.347089 + }, + { + "epoch": 1.2, + "grad_norm": 0.3472486708219598, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": 1.65625, + "logits/rejected": 2.15625, + "logps/chosen": -233.0, + "logps/rejected": -520.0, + "loss": 0.5795654296875, + "memory(GiB)": 43.66, + "nll_loss": 0.396484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.375, + "rewards/margins": 12.1875, + "rewards/rejected": 3.140625, + "step": 60, + "train_speed(iter/s)": 0.349928 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -0.23828125, + "eval_logits/rejected": 1.703125, + "eval_logps/chosen": -7.3125, + "eval_logps/rejected": -223.0, + "eval_loss": 0.51806640625, + "eval_nll_loss": 0.31640625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 8.4375, + "eval_rewards/rejected": 1.703125, + "eval_runtime": 1.2918, + "eval_samples_per_second": 3.096, + "eval_steps_per_second": 0.774, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.48830819519223906, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": 0.97265625, + "logits/rejected": 1.9921875, + "logps/chosen": -346.0, + "logps/rejected": -844.0, + "loss": 0.6193359375, + "memory(GiB)": 
43.66, + "nll_loss": 0.640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5625, + "rewards/margins": 11.875, + "rewards/rejected": 3.703125, + "step": 65, + "train_speed(iter/s)": 0.34619 + }, + { + "epoch": 1.4, + "grad_norm": 0.3461001279478362, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -0.6875, + "logits/rejected": 2.28125, + "logps/chosen": -88.0, + "logps/rejected": -1120.0, + "loss": 0.5681396484375, + "memory(GiB)": 43.66, + "nll_loss": 0.84375, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 11.5625, + "rewards/margins": 9.125, + "rewards/rejected": 2.484375, + "step": 70, + "train_speed(iter/s)": 0.349389 + }, + { + "epoch": 1.5, + "grad_norm": 0.49738864814975586, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": 2.515625, + "logits/rejected": 0.263671875, + "logps/chosen": -462.0, + "logps/rejected": -434.0, + "loss": 0.517205810546875, + "memory(GiB)": 43.66, + "nll_loss": 0.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.625, + "rewards/margins": 16.375, + "rewards/rejected": 2.28125, + "step": 75, + "train_speed(iter/s)": 0.351345 + }, + { + "epoch": 1.6, + "grad_norm": 0.3498527388541882, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": 3.5625, + "logits/rejected": -1.296875, + "logps/chosen": -580.0, + "logps/rejected": -132.0, + "loss": 0.5629638671875, + "memory(GiB)": 43.66, + "nll_loss": 0.703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.5, + "rewards/margins": 15.6875, + "rewards/rejected": 5.84375, + "step": 80, + "train_speed(iter/s)": 0.353922 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -0.68359375, + "eval_logits/rejected": 0.3125, + "eval_logps/chosen": -6.3125, + "eval_logps/rejected": -306.0, + "eval_loss": 0.5009765625, + "eval_nll_loss": 0.2734375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.1875, + "eval_rewards/margins": 16.75, + "eval_rewards/rejected": -6.59375, + "eval_runtime": 1.3254, + "eval_samples_per_second": 3.018, 
+ "eval_steps_per_second": 0.754, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.6649877164961882, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": 0.72265625, + "logits/rejected": 0.388671875, + "logps/chosen": -326.0, + "logps/rejected": -768.0, + "loss": 0.70977783203125, + "memory(GiB)": 43.66, + "nll_loss": 0.458984375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5, + "rewards/margins": 18.375, + "rewards/rejected": -2.859375, + "step": 85, + "train_speed(iter/s)": 0.351477 + }, + { + "epoch": 1.8, + "grad_norm": 0.5774391096810936, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": 2.15625, + "logits/rejected": 1.515625, + "logps/chosen": -290.0, + "logps/rejected": -520.0, + "loss": 0.5434326171875, + "memory(GiB)": 43.66, + "nll_loss": 0.56640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.75, + "rewards/margins": 14.25, + "rewards/rejected": 0.5390625, + "step": 90, + "train_speed(iter/s)": 0.351018 + }, + { + "epoch": 1.9, + "grad_norm": 0.46335079012099145, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": 1.65625, + "logits/rejected": 1.1640625, + "logps/chosen": -366.0, + "logps/rejected": -620.0, + "loss": 0.60682373046875, + "memory(GiB)": 43.66, + "nll_loss": 0.5, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.75, + "rewards/margins": 15.9375, + "rewards/rejected": 0.7890625, + "step": 95, + "train_speed(iter/s)": 0.351082 + }, + { + "epoch": 2.0, + "grad_norm": 0.44055183589508673, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": 0.59765625, + "logits/rejected": 0.1025390625, + "logps/chosen": -276.0, + "logps/rejected": -360.0, + "loss": 0.51856689453125, + "memory(GiB)": 43.66, + "nll_loss": 0.470703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.6875, + "rewards/margins": 14.0, + "rewards/rejected": 1.65625, + "step": 100, + "train_speed(iter/s)": 0.350869 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -0.63671875, + "eval_logits/rejected": 0.66796875, + 
"eval_logps/chosen": -6.84375, + "eval_logps/rejected": -280.0, + "eval_loss": 0.50390625, + "eval_nll_loss": 0.298828125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 14.125, + "eval_rewards/rejected": -4.0, + "eval_runtime": 1.3254, + "eval_samples_per_second": 3.018, + "eval_steps_per_second": 0.754, + "step": 100 + }, + { + "epoch": 2.1, + "grad_norm": 0.43319830084665456, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": 3.5, + "logits/rejected": 0.185546875, + "logps/chosen": -442.0, + "logps/rejected": -580.0, + "loss": 0.540869140625, + "memory(GiB)": 43.66, + "nll_loss": 0.60546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.375, + "rewards/margins": 16.125, + "rewards/rejected": 4.25, + "step": 105, + "train_speed(iter/s)": 0.34793 + }, + { + "epoch": 2.2, + "grad_norm": 0.5087186205209226, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": 0.41015625, + "logits/rejected": 1.9375, + "logps/chosen": -688.0, + "logps/rejected": -1280.0, + "loss": 0.5629150390625, + "memory(GiB)": 43.66, + "nll_loss": 0.69140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.875, + "rewards/margins": 14.5, + "rewards/rejected": 6.34375, + "step": 110, + "train_speed(iter/s)": 0.34833 + }, + { + "epoch": 2.3, + "grad_norm": 0.30352973245628134, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": 1.546875, + "logits/rejected": 1.0625, + "logps/chosen": -568.0, + "logps/rejected": -704.0, + "loss": 0.596630859375, + "memory(GiB)": 43.66, + "nll_loss": 0.5390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.0, + "rewards/margins": 15.4375, + "rewards/rejected": 5.5, + "step": 115, + "train_speed(iter/s)": 0.349547 + }, + { + "epoch": 2.4, + "grad_norm": 0.5099680534446042, + "learning_rate": 5.7592521739125726e-05, + "logits/chosen": 2.40625, + "logits/rejected": 0.002349853515625, + "logps/chosen": -368.0, + "logps/rejected": -316.0, + "loss": 0.463055419921875, + "memory(GiB)": 
43.66, + "nll_loss": 0.37109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.625, + "rewards/margins": 13.1875, + "rewards/rejected": 3.484375, + "step": 120, + "train_speed(iter/s)": 0.35126 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -0.96484375, + "eval_logits/rejected": 0.416015625, + "eval_logps/chosen": -6.4375, + "eval_logps/rejected": -280.0, + "eval_loss": 0.49755859375, + "eval_nll_loss": 0.279296875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.1875, + "eval_rewards/margins": 14.1875, + "eval_rewards/rejected": -4.0, + "eval_runtime": 1.3007, + "eval_samples_per_second": 3.075, + "eval_steps_per_second": 0.769, + "step": 120 + }, + { + "epoch": 2.5, + "grad_norm": 0.6759212291558464, + "learning_rate": 5.430276272567485e-05, + "logits/chosen": -1.4140625, + "logits/rejected": 0.82421875, + "logps/chosen": -62.5, + "logps/rejected": -1288.0, + "loss": 0.5360504150390625, + "memory(GiB)": 43.66, + "nll_loss": 0.7265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.75, + "rewards/margins": 19.75, + "rewards/rejected": -7.96875, + "step": 125, + "train_speed(iter/s)": 0.350267 + }, + { + "epoch": 2.6, + "grad_norm": 0.3635118516286732, + "learning_rate": 5.0994109383253506e-05, + "logits/chosen": 2.3125, + "logits/rejected": 0.546875, + "logps/chosen": -348.0, + "logps/rejected": -314.0, + "loss": 0.4655029296875, + "memory(GiB)": 43.66, + "nll_loss": 0.484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.875, + "rewards/margins": 15.3125, + "rewards/rejected": 3.515625, + "step": 130, + "train_speed(iter/s)": 0.35164 + }, + { + "epoch": 2.7, + "grad_norm": 0.4546423377677983, + "learning_rate": 4.768109069909307e-05, + "logits/chosen": -0.375, + "logits/rejected": 0.88671875, + "logps/chosen": -157.0, + "logps/rejected": -624.0, + "loss": 0.44596214294433595, + "memory(GiB)": 43.66, + "nll_loss": 0.24609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.375, + "rewards/margins": 14.9375, + 
"rewards/rejected": -1.5625, + "step": 135, + "train_speed(iter/s)": 0.352504 + }, + { + "epoch": 2.8, + "grad_norm": 0.3766526906078867, + "learning_rate": 4.4378254829551396e-05, + "logits/chosen": 1.484375, + "logits/rejected": 1.1484375, + "logps/chosen": -392.0, + "logps/rejected": -708.0, + "loss": 0.4998992919921875, + "memory(GiB)": 43.66, + "nll_loss": 0.734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.625, + "rewards/margins": 17.0, + "rewards/rejected": 1.5546875, + "step": 140, + "train_speed(iter/s)": 0.353731 + }, + { + "epoch": 2.8, + "eval_logits/chosen": -1.0546875, + "eval_logits/rejected": 0.28515625, + "eval_logps/chosen": -6.65625, + "eval_logps/rejected": -282.0, + "eval_loss": 0.49853515625, + "eval_nll_loss": 0.2890625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.1875, + "eval_rewards/margins": 14.375, + "eval_rewards/rejected": -4.1875, + "eval_runtime": 1.2976, + "eval_samples_per_second": 3.083, + "eval_steps_per_second": 0.771, + "step": 140 + }, + { + "epoch": 2.9, + "grad_norm": 0.7368475635982081, + "learning_rate": 4.11001052161225e-05, + "logits/chosen": 1.484375, + "logits/rejected": 0.7578125, + "logps/chosen": -624.0, + "logps/rejected": -664.0, + "loss": 0.5592437744140625, + "memory(GiB)": 43.66, + "nll_loss": 0.7421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.75, + "rewards/margins": 16.875, + "rewards/rejected": 4.90625, + "step": 145, + "train_speed(iter/s)": 0.350922 + }, + { + "epoch": 3.0, + "grad_norm": 0.526438371292198, + "learning_rate": 3.786103689779861e-05, + "logits/chosen": 2.53125, + "logits/rejected": -0.11962890625, + "logps/chosen": -406.0, + "logps/rejected": -310.0, + "loss": 0.4158843994140625, + "memory(GiB)": 43.66, + "nll_loss": 0.39453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.5, + "rewards/margins": 19.0, + "rewards/rejected": 3.5, + "step": 150, + "train_speed(iter/s)": 0.350374 + }, + { + "epoch": 3.1, + "grad_norm": 0.3832315044394523, + 
"learning_rate": 3.467527329945026e-05, + "logits/chosen": 1.1484375, + "logits/rejected": 1.328125, + "logps/chosen": -600.0, + "logps/rejected": -588.0, + "loss": 0.5336650848388672, + "memory(GiB)": 43.66, + "nll_loss": 0.515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.125, + "rewards/margins": 17.375, + "rewards/rejected": 1.7265625, + "step": 155, + "train_speed(iter/s)": 0.350973 + }, + { + "epoch": 3.2, + "grad_norm": 0.41804293149830996, + "learning_rate": 3.1556803773799614e-05, + "logits/chosen": 2.84375, + "logits/rejected": 0.10693359375, + "logps/chosen": -416.0, + "logps/rejected": -360.0, + "loss": 0.456103515625, + "memory(GiB)": 43.66, + "nll_loss": 0.38671875, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.5, + "rewards/margins": 16.75, + "rewards/rejected": 4.71875, + "step": 160, + "train_speed(iter/s)": 0.350912 + }, + { + "epoch": 3.2, + "eval_logits/chosen": -1.046875, + "eval_logits/rejected": 0.1669921875, + "eval_logps/chosen": -6.46875, + "eval_logps/rejected": -286.0, + "eval_loss": 0.4970703125, + "eval_nll_loss": 0.28125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.1875, + "eval_rewards/margins": 14.75, + "eval_rewards/rejected": -4.59375, + "eval_runtime": 1.2858, + "eval_samples_per_second": 3.111, + "eval_steps_per_second": 0.778, + "step": 160 + }, + { + "epoch": 3.3, + "grad_norm": 0.9977986802164632, + "learning_rate": 2.8519322171253602e-05, + "logits/chosen": -0.173828125, + "logits/rejected": 0.234375, + "logps/chosen": -145.0, + "logps/rejected": -498.0, + "loss": 0.4285240173339844, + "memory(GiB)": 43.66, + "nll_loss": 0.16796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.625, + "rewards/margins": 18.5, + "rewards/rejected": -1.7890625, + "step": 165, + "train_speed(iter/s)": 0.350263 + }, + { + "epoch": 3.4, + "grad_norm": 0.5073480559704879, + "learning_rate": 2.5576166707349385e-05, + "logits/chosen": 3.234375, + "logits/rejected": -0.92578125, + "logps/chosen": -640.0, + 
"logps/rejected": -207.0, + "loss": 0.45649566650390627, + "memory(GiB)": 43.66, + "nll_loss": 0.453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 25.0, + "rewards/margins": 19.125, + "rewards/rejected": 5.875, + "step": 170, + "train_speed(iter/s)": 0.35017 + }, + { + "epoch": 3.5, + "grad_norm": 0.5295452954524565, + "learning_rate": 2.2740261391866637e-05, + "logits/chosen": 0.498046875, + "logits/rejected": 0.392578125, + "logps/chosen": -466.0, + "logps/rejected": -592.0, + "loss": 0.39521121978759766, + "memory(GiB)": 43.66, + "nll_loss": 0.65625, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.75, + "rewards/margins": 14.75, + "rewards/rejected": 7.03125, + "step": 175, + "train_speed(iter/s)": 0.350866 + }, + { + "epoch": 3.6, + "grad_norm": 0.30567037033016853, + "learning_rate": 2.002405927680374e-05, + "logits/chosen": 3.09375, + "logits/rejected": -0.09326171875, + "logps/chosen": -482.0, + "logps/rejected": -262.0, + "loss": 0.500946044921875, + "memory(GiB)": 43.66, + "nll_loss": 0.546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.25, + "rewards/margins": 17.125, + "rewards/rejected": 4.125, + "step": 180, + "train_speed(iter/s)": 0.351966 + }, + { + "epoch": 3.6, + "eval_logits/chosen": -1.1640625, + "eval_logits/rejected": 0.060546875, + "eval_logps/chosen": -7.0, + "eval_logps/rejected": -284.0, + "eval_loss": 0.501953125, + "eval_nll_loss": 0.302734375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 14.5, + "eval_rewards/rejected": -4.40625, + "eval_runtime": 1.2632, + "eval_samples_per_second": 3.167, + "eval_steps_per_second": 0.792, + "step": 180 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + 
"attributes": {} + } + }, + "total_flos": 158140817080320.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/training_args.bin b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..475462e40417630252a03e5a9352b5e11f000ed1 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:647dcffabf1a4c913f292b185978f2f179c4819a3df809536d284c803d9d97fe +size 8888 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/zero_to_fp32.py b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-180/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensor instead of torch tensor, which is more memory efficient. + Convert the pseudo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.items(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it is no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model``: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/README.md b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/README.md new file mode 100644 index 0000000000000000000000000000000000000000..525e0a2a5243ae77491e227cbf74fa8ff0a43579 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-7b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct 
and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/adapter_config.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..454fa4ee7fc180fe11671ee7f6c90d0ba44328a6 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + 
"rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "gate_proj", + "up_proj", + "o_proj", + "v_proj", + "down_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/adapter_model.safetensors b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c29f8d025801c76a93e14c2ae20f49c38380be98 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:291aed8ff9dfd6b3ed1cd6662e4ff418391b904f319c9a44cd663abc92763ce4 +size 40422208 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/additional_config.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/args.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/args.json new file mode 100644 index 0000000000000000000000000000000000000000..4b016b97ef2d4589dae9d4623f25925cb829d241 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/args.json @@ -0,0 +1,371 @@ +{ + "model": 
"/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + 
"do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + 
"fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + 
"push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + 
"galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-7b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-7b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, 
task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, 
per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, 
use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, 
neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3cf9dbaca4790450c98fe40cf787e7b08cb870d1 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0f5ef5a0758793583cd847c7b0d533d50651cf41b2011931b9a7039d2a6c990 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0814e791b3bc4582894ca832500e45fd4e7df533 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31917b547f81c9793344812109ab7b370cd7e89ae5ee4703de0ad709b30db96e +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..72d1fda720550df4468186d965789eb03e8b7992 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b76f686d07003b8437a8fc516d3870edb7ac24a1ce792fe917875c37868aef57 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..15cd16734d43246fa8f9375c8e7f40dff1a32cc9 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:6726b264d970553fbe2a4c5d3a6e2406b1c7562ea889a675d51c9e64c21ed131 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aaacfd035c7dce802fcedc32e1749c6eb8c58bc2 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c259c90292087f77290ee1cb85f20493a56fb17a6d17fec7cd0f0ead2396373 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..70188a82b2d11d7e579836bf4265fe328513e5fd --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:110d56e442caeff8326e0dbf1c96b20eadb0f557952a55fd0219fc854025bab0 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..775d5b60eac113b83aedebbc765e0b332771dd27 --- 
/dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:852ed70c1c1e8d584fad655bdf4fe4d04b2d2f4216fbc45f4400bea083f26449 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..273e79f8f8cd76794aed23878a680c0ec5e2c495 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8650c83f8609a6b9c621b153db1f3eb8f7c2a28669a55c176663b13c8d85f1df +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b4a6494c49c7c4cbda3956b2b23841f36a7eab3a --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17a80a68e8480c299a64f918d4f8a6ca8ca80428272548b81fbd0c00d5a39b88 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..24e57c93d3b8d2f6b792125cdaf15da4d2ad8f06 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e02dc35ad03936993c498420ccd56b721fe96db4f2437984938b31aaaa89c5a2 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2326f198fbb102d129354f9a03e95b9a98301f29 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ffe7486db1ed141a40e3ef4a7c68eaab1b3c511a8d22b177137c97f5a51396a +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d9b4c8822f57a9828ef42348f06a457638b49daa --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:71ecdc4491c512526f78f7ff5f9b5c85e170e06f3ccf43f16938549754853539 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/zero_pp_rank_4_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..69ef14583c92c7e5496c80f8df5ebe9d80ed8ed3 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bbc184186809d4e9c968e56e3fae462cfde10b4128b9ee80f4e827d1563759b +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/zero_pp_rank_5_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b820cd220bdccaed64d12e93f0b660d1e4dcc58c --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7af0d4d721f59029aee32d0aa04788a158f13d0f6aab4935f339f7f285178000 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/zero_pp_rank_6_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a38b55e5615746049d3c59a85f3ab9fcd9e97674 --- /dev/null +++ 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e9d44aea5bbea403a76563da5ea43501d0d6b5b714069b0168d7f41df724bb4 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/zero_pp_rank_7_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..46dde90b2c1b6beef5ebfaa151cc779c4b151626 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/global_step20/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce803b3ea768b8239e5839639df5729e31e14d464e4b2e67db1aa0d6187de2b7 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/latest b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/latest new file mode 100644 index 0000000000000000000000000000000000000000..11e5c63223cdf01f44f9f3129915f9de3d647f31 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/latest @@ -0,0 +1 @@ +global_step20 \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/rng_state_0.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..b346349ce12dd5a17d4b91ed2a5722bb52550950 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/rng_state_0.pth @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:ad8a35afd8967cbb748405387e44426e43ad127028e826eddc9b67d2ca873c85 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/rng_state_1.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..68f3c6994456cb8d0592a5375d99503c8924b1c4 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f338ce80d7c441076bfc8c53b84067a0181f5a14e80c13d5acb8150b659f4d73 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/rng_state_2.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..be044f6ceeed587d30e80c2f72d5aa19fdc9947b --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9fbc9fa428939be10b46779f0eb5cd833e0da426b1cbdee77b3a55b6952235b +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/rng_state_3.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..fc825249656a9b858782542bd3f4386250f1dfe0 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac55dba0b79d5fa4699d239da2f966d52040d576d31234ac8d4632e6956481bc +size 15984 diff --git 
a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/rng_state_4.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..d30f52a44be563c152ae09db6ae934da6da0d3ed --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af2d0c015100768ffa23faf3b6c2d54ea89eb045603e30e55cd211e06ff34972 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/rng_state_5.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..c8715d27ab23ae545d58039cf949cc44ecc1da5e --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c60a1b40608e34bc801c8231f97b81c53b5290dfaed1b9cd0ccbeca29574a991 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/rng_state_6.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..1ed791b6ef76eadf0b0c55a5733411771e2ae027 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ad6a142a403eb9aafc4a3a9a856bca648fe31fd22d796867baca31fb13656aa +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/rng_state_7.pth 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..800c3bbbc5edf7db01a8316069d439c5fb8d8c30 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38bc23a138cc800b22881742c0f3f9a71731a9a7111c6058a0077e6274d21773 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/scheduler.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2729ff9a97436d6c8ad743637f529065140ad3f1 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2e1cf73eea4791075e839e628da180bf39e1e01fcc1630f4ac9c723d8793968 +size 1064 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/trainer_state.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1430fa159312a6647d33b59ced395846a2f1a0d4 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/trainer_state.json @@ -0,0 +1,140 @@ +{ + "best_metric": 0.67285156, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20", + "epoch": 0.4, + "eval_steps": 20, + "global_step": 20, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 9.15866064284304, + 
"learning_rate": 7.692307692307694e-06, + "logits/chosen": 2.390625, + "logits/rejected": -2.546875, + "logps/chosen": -520.0, + "logps/rejected": -142.0, + "loss": 2.4697265625, + "memory(GiB)": 8.4, + "nll_loss": 0.51171875, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.126463 + }, + { + "epoch": 0.1, + "grad_norm": 26.585759281524314, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": 0.1533203125, + "logits/rejected": -0.8671875, + "logps/chosen": -728.0, + "logps/rejected": -304.0, + "loss": 2.3878173828125, + "memory(GiB)": 12.04, + "nll_loss": 1.1484375, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.125, + "rewards/margins": -0.0250244140625, + "rewards/rejected": 0.150390625, + "step": 5, + "train_speed(iter/s)": 0.276117 + }, + { + "epoch": 0.2, + "grad_norm": 8.664467296577401, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -0.419921875, + "logits/rejected": 1.2890625, + "logps/chosen": -644.0, + "logps/rejected": -572.0, + "loss": 2.18505859375, + "memory(GiB)": 17.87, + "nll_loss": 1.4921875, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.1103515625, + "rewards/margins": 0.08984375, + "rewards/rejected": 0.0201416015625, + "step": 10, + "train_speed(iter/s)": 0.313547 + }, + { + "epoch": 0.3, + "grad_norm": 6.325430646213694, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -0.7578125, + "logits/rejected": 1.8203125, + "logps/chosen": -924.0, + "logps/rejected": -1552.0, + "loss": 1.76845703125, + "memory(GiB)": 32.02, + "nll_loss": 1.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.75, + "rewards/margins": 2.390625, + "rewards/rejected": 0.359375, + "step": 15, + "train_speed(iter/s)": 0.318522 + }, + { + "epoch": 0.4, + "grad_norm": 2.351010309283972, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": 0.5859375, + "logits/rejected": -0.4375, + "logps/chosen": -432.0, + 
"logps/rejected": -324.0, + "loss": 1.172265625, + "memory(GiB)": 32.02, + "nll_loss": 0.6640625, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 4.65625, + "rewards/margins": 1.5546875, + "rewards/rejected": 3.09375, + "step": 20, + "train_speed(iter/s)": 0.331194 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.3125, + "eval_logits/rejected": 0.86328125, + "eval_logps/chosen": -14.3125, + "eval_logps/rejected": -182.0, + "eval_loss": 0.6728515625, + "eval_nll_loss": 0.62109375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.375, + "eval_rewards/margins": 3.5625, + "eval_rewards/rejected": 5.8125, + "eval_runtime": 1.2862, + "eval_samples_per_second": 3.11, + "eval_steps_per_second": 0.778, + "step": 20 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 17829702533120.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/training_args.bin b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..475462e40417630252a03e5a9352b5e11f000ed1 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:647dcffabf1a4c913f292b185978f2f179c4819a3df809536d284c803d9d97fe +size 8888 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/zero_to_fp32.py 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-20/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/README.md b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..525e0a2a5243ae77491e227cbf74fa8ff0a43579 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-7b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both 
direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/adapter_config.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..454fa4ee7fc180fe11671ee7f6c90d0ba44328a6 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + 
"rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "gate_proj", + "up_proj", + "o_proj", + "v_proj", + "down_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/adapter_model.safetensors b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1c4be14f4da0c078c3236a25aa831f9bb288720c --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39ace24cf2048e4349cf8091096a1d07f4f9b47773772b072ff65c7dd2186db6 +size 40422208 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/additional_config.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/args.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/args.json new file mode 100644 index 0000000000000000000000000000000000000000..4b016b97ef2d4589dae9d4623f25925cb829d241 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/args.json @@ -0,0 +1,371 @@ +{ + "model": 
"/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + 
"do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + 
"fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + 
"push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + 
"galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-7b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-7b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, 
task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, 
per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, 
use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, 
neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..55207fe16f1888819f7908d96fbe9a93f018c5c5 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eabef1120bbc83015af316a9f045741ff79d1e0fe15f1a8203e2a08c605aedcc +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d8ac16efa4d15e9552860cf1a25ef409027e5e20 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b82155707109d1f21334a47c232a0dc3abfedcd5800a701986a0f9789060dc7a +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..245e8b42f56877968ed0b661bc469f61b3a6e78b --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba2383e1a149adcb5e3624801703e7e84500345ace99d901f54af94862c65654 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a4fc308650618ebb95b06d8466b57cfc92700ea --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:a7c4faea69c7323c24ca61a58973b25a0984858d6ae997a863ec39c8422c0401 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4fca6c4496800bb92aa3b16ffddcdfa6a3bb7696 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:031eb08cc5e3c069779c60e705b0fcf561e56cc133e2a76a50a4212f7d408b2b +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aa2c1f84fcfb50b7c082edc111f35b4c57d7d448 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4d8697ac5e85fdc58a443777d21d0d46da7f674ba3ff0120bfe3d8b543326c1 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..c4b62174314ad7e8899e7f273d2f02ad77291046 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0647fe249f6b79bded9f458e656eb491e36674b8b04ccad01dab7cd1c49ebe2a +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2dfc13f3bf50be6875e089b509755a8021a0494d --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16488bb9823f42388fb81613ac3ce751e3ea4f93c6769843b1b383ce9e410262 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/zero_pp_rank_0_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1bc5bc56f73205ba2135799336f0a7064e6694f5 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d2a388ed8491a7e5274a5ccc99b82985d3682a4ce2643835dd82277662903eb +size 388374 diff --git 
a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/zero_pp_rank_1_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dc0415f17b46eb6dd75e181662f165659c6c50e9 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc3afc37bd93245278225dcb9c2336355cab40acb3105dcef387cda72907f080 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/zero_pp_rank_2_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8293434fa15b9d7325503fdd49ac3bd45f93a11c --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c19feceddb49872d5f8500aeff4b1c46b1a23261312d486370cffa90db7118e3 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/zero_pp_rank_3_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2af427df6cee54eed634163856903f166c793edb --- /dev/null +++ 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16a79da37b931a69ee6858284aef9624fe8ab02ee4de05d0508a0eb1d7147280 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/zero_pp_rank_4_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d566aa8907901899f3d895e1f828e8d6798ecde2 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec924415199c78d221670613e828b15c1e0ad777a926c8886c704248f3a0308c +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/zero_pp_rank_5_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..93c48f2d72d24083ec946d0909156a8ef69b7f31 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b17871ca5bfaf834398459adcf237962206a1c2d019934562e107ce86d13fd17 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/zero_pp_rank_6_mp_rank_00_model_states.pt 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b2abccf5e43bc561922b39435ad6ec867a966aad --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7bd5d3c521ea3926eb86d8d41e11dea82cb485b896f7cd79b6b2cf5ac82d3a3 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/zero_pp_rank_7_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1df47acac618a9ee1dfaef3b2b677bac7c5373e7 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/global_step200/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e33060d56899d8146e275ca467ddc1e6f46f0d77b62f9c089c53343ed5962d9c +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/latest b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/latest new file mode 100644 index 0000000000000000000000000000000000000000..753e24e10f3a2489150f458205cf759fd8b6081f --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/latest @@ -0,0 +1 @@ +global_step200 \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/rng_state_0.pth 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..584f4a4a43f100f35696d7314a633631af587f25 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7891ffa7c7dae99113aa986d67278b52b8c57db55001dc3547a61f24569a34ee +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/rng_state_1.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..05b027a867e5e9cebd446293ecff82cfb240cc76 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8b92875cb04deec367605433847d1bda444b178b643d2da7ed9aaf738d232b4 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/rng_state_2.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..af98f0dfe2a5d89fbccf90df58246a0b078c7016 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9f5f3338a05e325b5408a1cd0b6f5e5b10fad05fe479d63f44bec4cf18107d6 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/rng_state_3.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/rng_state_3.pth new file mode 100644 index 
0000000000000000000000000000000000000000..715aa4a4ee3915f810fc2bacb2153eb8a0913781 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1be749fea477a3867d44010631937e0d8f071ca5f9614f9795c92c7fa68833a6 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/rng_state_4.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..c7bde70899833455b6ee4a99aff9388abc5ffe92 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbc4a5ea4532c621f4c8e9891117b2e597a7f005001e8b4f2a1b4da8c82bf964 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/rng_state_5.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..90cdeaa2fe438098e9d95ddbc06c765e51af1e78 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:480f9fe7dd71b54d915b46162e34b780ba2467d5542115cc809dbca60b394c0e +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/rng_state_6.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..2bd30529614c5be239cd9477af6bef0e313740b6 --- /dev/null +++ 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c11d982dcd813e82c2d97a5491ce9624cff2dd22e8655ea617ccef1fc1474470 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/rng_state_7.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..bed311094effd49cc2c89237c675f56eade157d1 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73494fac3a001cba7cedd097b97f028d4c1d136ee6709214b0a7fe305e5b9089 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/scheduler.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b08896e3e64039017a0606b43a6327f1f78848dc --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:826281cb7f404c3805b9798147d05074dd208eac748e2052087055a015aaeaed +size 1064 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/trainer_state.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..aef100c2eae75e8125d890e86f3cd69e4adfaa1f --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/trainer_state.json @@ -0,0 +1,941 @@ +{ + "best_metric": 0.49707031, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160", + "epoch": 4.0, + "eval_steps": 20, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 9.15866064284304, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": 2.390625, + "logits/rejected": -2.546875, + "logps/chosen": -520.0, + "logps/rejected": -142.0, + "loss": 2.4697265625, + "memory(GiB)": 8.4, + "nll_loss": 0.51171875, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.126463 + }, + { + "epoch": 0.1, + "grad_norm": 26.585759281524314, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": 0.1533203125, + "logits/rejected": -0.8671875, + "logps/chosen": -728.0, + "logps/rejected": -304.0, + "loss": 2.3878173828125, + "memory(GiB)": 12.04, + "nll_loss": 1.1484375, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.125, + "rewards/margins": -0.0250244140625, + "rewards/rejected": 0.150390625, + "step": 5, + "train_speed(iter/s)": 0.276117 + }, + { + "epoch": 0.2, + "grad_norm": 8.664467296577401, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -0.419921875, + "logits/rejected": 1.2890625, + "logps/chosen": -644.0, + "logps/rejected": -572.0, + "loss": 2.18505859375, + "memory(GiB)": 17.87, + "nll_loss": 1.4921875, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.1103515625, + "rewards/margins": 0.08984375, + "rewards/rejected": 0.0201416015625, + "step": 10, + "train_speed(iter/s)": 0.313547 + }, + { + "epoch": 0.3, + "grad_norm": 6.325430646213694, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -0.7578125, + "logits/rejected": 1.8203125, + "logps/chosen": -924.0, + "logps/rejected": -1552.0, + "loss": 1.76845703125, + "memory(GiB)": 32.02, + 
"nll_loss": 1.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.75, + "rewards/margins": 2.390625, + "rewards/rejected": 0.359375, + "step": 15, + "train_speed(iter/s)": 0.318522 + }, + { + "epoch": 0.4, + "grad_norm": 2.351010309283972, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": 0.5859375, + "logits/rejected": -0.4375, + "logps/chosen": -432.0, + "logps/rejected": -324.0, + "loss": 1.172265625, + "memory(GiB)": 32.02, + "nll_loss": 0.6640625, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 4.65625, + "rewards/margins": 1.5546875, + "rewards/rejected": 3.09375, + "step": 20, + "train_speed(iter/s)": 0.331194 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.3125, + "eval_logits/rejected": 0.86328125, + "eval_logps/chosen": -14.3125, + "eval_logps/rejected": -182.0, + "eval_loss": 0.6728515625, + "eval_nll_loss": 0.62109375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.375, + "eval_rewards/margins": 3.5625, + "eval_rewards/rejected": 5.8125, + "eval_runtime": 1.2862, + "eval_samples_per_second": 3.11, + "eval_steps_per_second": 0.778, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.5984307422346672, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": 0.255859375, + "logits/rejected": -0.0390625, + "logps/chosen": -326.0, + "logps/rejected": -420.0, + "loss": 0.8047607421875, + "memory(GiB)": 32.02, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.125, + "rewards/margins": 5.40625, + "rewards/rejected": 3.6875, + "step": 25, + "train_speed(iter/s)": 0.330795 + }, + { + "epoch": 0.6, + "grad_norm": 0.5223734060204478, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -0.21875, + "logits/rejected": 1.546875, + "logps/chosen": -400.0, + "logps/rejected": -656.0, + "loss": 0.6901611328125, + "memory(GiB)": 32.02, + "nll_loss": 0.78125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.5625, + "rewards/margins": 6.78125, + "rewards/rejected": 3.765625, + 
"step": 30, + "train_speed(iter/s)": 0.339407 + }, + { + "epoch": 0.7, + "grad_norm": 1.0839186250481967, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": 1.5625, + "logits/rejected": -1.578125, + "logps/chosen": -592.0, + "logps/rejected": -322.0, + "loss": 0.7055419921875, + "memory(GiB)": 32.02, + "nll_loss": 0.96875, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.75, + "rewards/margins": 9.625, + "rewards/rejected": 3.125, + "step": 35, + "train_speed(iter/s)": 0.346622 + }, + { + "epoch": 0.8, + "grad_norm": 13.025539300060226, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": 0.08740234375, + "logits/rejected": 0.71484375, + "logps/chosen": -225.0, + "logps/rejected": -322.0, + "loss": 0.6363037109375, + "memory(GiB)": 32.02, + "nll_loss": 0.56640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.0625, + "rewards/margins": 7.53125, + "rewards/rejected": 3.546875, + "step": 40, + "train_speed(iter/s)": 0.345794 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -0.8046875, + "eval_logits/rejected": 0.9375, + "eval_logps/chosen": -7.1875, + "eval_logps/rejected": -212.0, + "eval_loss": 0.5283203125, + "eval_nll_loss": 0.3125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 7.3125, + "eval_rewards/rejected": 2.796875, + "eval_runtime": 1.2905, + "eval_samples_per_second": 3.1, + "eval_steps_per_second": 0.775, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.49724108967226127, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": 2.890625, + "logits/rejected": 0.002349853515625, + "logps/chosen": -426.0, + "logps/rejected": -430.0, + "loss": 0.5437744140625, + "memory(GiB)": 32.02, + "nll_loss": 0.470703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.0625, + "rewards/margins": 10.6875, + "rewards/rejected": 3.390625, + "step": 45, + "train_speed(iter/s)": 0.342553 + }, + { + "epoch": 1.0, + "grad_norm": 1.4155859018634052, + "learning_rate": 9.410582299213573e-05, + 
"logits/chosen": 0.2451171875, + "logits/rejected": 1.75, + "logps/chosen": -804.0, + "logps/rejected": -1480.0, + "loss": 0.57803955078125, + "memory(GiB)": 32.02, + "nll_loss": 0.6953125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.25, + "rewards/margins": 9.125, + "rewards/rejected": 7.125, + "step": 50, + "train_speed(iter/s)": 0.346029 + }, + { + "epoch": 1.1, + "grad_norm": 0.5694275670231188, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": 1.8359375, + "logits/rejected": 0.703125, + "logps/chosen": -386.0, + "logps/rejected": -612.0, + "loss": 0.6138671875, + "memory(GiB)": 43.66, + "nll_loss": 0.546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.75, + "rewards/margins": 12.25, + "rewards/rejected": 3.53125, + "step": 55, + "train_speed(iter/s)": 0.347089 + }, + { + "epoch": 1.2, + "grad_norm": 0.3472486708219598, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": 1.65625, + "logits/rejected": 2.15625, + "logps/chosen": -233.0, + "logps/rejected": -520.0, + "loss": 0.5795654296875, + "memory(GiB)": 43.66, + "nll_loss": 0.396484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.375, + "rewards/margins": 12.1875, + "rewards/rejected": 3.140625, + "step": 60, + "train_speed(iter/s)": 0.349928 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -0.23828125, + "eval_logits/rejected": 1.703125, + "eval_logps/chosen": -7.3125, + "eval_logps/rejected": -223.0, + "eval_loss": 0.51806640625, + "eval_nll_loss": 0.31640625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 8.4375, + "eval_rewards/rejected": 1.703125, + "eval_runtime": 1.2918, + "eval_samples_per_second": 3.096, + "eval_steps_per_second": 0.774, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.48830819519223906, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": 0.97265625, + "logits/rejected": 1.9921875, + "logps/chosen": -346.0, + "logps/rejected": -844.0, + "loss": 0.6193359375, + "memory(GiB)": 
43.66, + "nll_loss": 0.640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5625, + "rewards/margins": 11.875, + "rewards/rejected": 3.703125, + "step": 65, + "train_speed(iter/s)": 0.34619 + }, + { + "epoch": 1.4, + "grad_norm": 0.3461001279478362, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -0.6875, + "logits/rejected": 2.28125, + "logps/chosen": -88.0, + "logps/rejected": -1120.0, + "loss": 0.5681396484375, + "memory(GiB)": 43.66, + "nll_loss": 0.84375, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 11.5625, + "rewards/margins": 9.125, + "rewards/rejected": 2.484375, + "step": 70, + "train_speed(iter/s)": 0.349389 + }, + { + "epoch": 1.5, + "grad_norm": 0.49738864814975586, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": 2.515625, + "logits/rejected": 0.263671875, + "logps/chosen": -462.0, + "logps/rejected": -434.0, + "loss": 0.517205810546875, + "memory(GiB)": 43.66, + "nll_loss": 0.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.625, + "rewards/margins": 16.375, + "rewards/rejected": 2.28125, + "step": 75, + "train_speed(iter/s)": 0.351345 + }, + { + "epoch": 1.6, + "grad_norm": 0.3498527388541882, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": 3.5625, + "logits/rejected": -1.296875, + "logps/chosen": -580.0, + "logps/rejected": -132.0, + "loss": 0.5629638671875, + "memory(GiB)": 43.66, + "nll_loss": 0.703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.5, + "rewards/margins": 15.6875, + "rewards/rejected": 5.84375, + "step": 80, + "train_speed(iter/s)": 0.353922 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -0.68359375, + "eval_logits/rejected": 0.3125, + "eval_logps/chosen": -6.3125, + "eval_logps/rejected": -306.0, + "eval_loss": 0.5009765625, + "eval_nll_loss": 0.2734375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.1875, + "eval_rewards/margins": 16.75, + "eval_rewards/rejected": -6.59375, + "eval_runtime": 1.3254, + "eval_samples_per_second": 3.018, 
+ "eval_steps_per_second": 0.754, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.6649877164961882, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": 0.72265625, + "logits/rejected": 0.388671875, + "logps/chosen": -326.0, + "logps/rejected": -768.0, + "loss": 0.70977783203125, + "memory(GiB)": 43.66, + "nll_loss": 0.458984375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5, + "rewards/margins": 18.375, + "rewards/rejected": -2.859375, + "step": 85, + "train_speed(iter/s)": 0.351477 + }, + { + "epoch": 1.8, + "grad_norm": 0.5774391096810936, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": 2.15625, + "logits/rejected": 1.515625, + "logps/chosen": -290.0, + "logps/rejected": -520.0, + "loss": 0.5434326171875, + "memory(GiB)": 43.66, + "nll_loss": 0.56640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.75, + "rewards/margins": 14.25, + "rewards/rejected": 0.5390625, + "step": 90, + "train_speed(iter/s)": 0.351018 + }, + { + "epoch": 1.9, + "grad_norm": 0.46335079012099145, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": 1.65625, + "logits/rejected": 1.1640625, + "logps/chosen": -366.0, + "logps/rejected": -620.0, + "loss": 0.60682373046875, + "memory(GiB)": 43.66, + "nll_loss": 0.5, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.75, + "rewards/margins": 15.9375, + "rewards/rejected": 0.7890625, + "step": 95, + "train_speed(iter/s)": 0.351082 + }, + { + "epoch": 2.0, + "grad_norm": 0.44055183589508673, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": 0.59765625, + "logits/rejected": 0.1025390625, + "logps/chosen": -276.0, + "logps/rejected": -360.0, + "loss": 0.51856689453125, + "memory(GiB)": 43.66, + "nll_loss": 0.470703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.6875, + "rewards/margins": 14.0, + "rewards/rejected": 1.65625, + "step": 100, + "train_speed(iter/s)": 0.350869 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -0.63671875, + "eval_logits/rejected": 0.66796875, + 
"eval_logps/chosen": -6.84375, + "eval_logps/rejected": -280.0, + "eval_loss": 0.50390625, + "eval_nll_loss": 0.298828125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 14.125, + "eval_rewards/rejected": -4.0, + "eval_runtime": 1.3254, + "eval_samples_per_second": 3.018, + "eval_steps_per_second": 0.754, + "step": 100 + }, + { + "epoch": 2.1, + "grad_norm": 0.43319830084665456, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": 3.5, + "logits/rejected": 0.185546875, + "logps/chosen": -442.0, + "logps/rejected": -580.0, + "loss": 0.540869140625, + "memory(GiB)": 43.66, + "nll_loss": 0.60546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.375, + "rewards/margins": 16.125, + "rewards/rejected": 4.25, + "step": 105, + "train_speed(iter/s)": 0.34793 + }, + { + "epoch": 2.2, + "grad_norm": 0.5087186205209226, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": 0.41015625, + "logits/rejected": 1.9375, + "logps/chosen": -688.0, + "logps/rejected": -1280.0, + "loss": 0.5629150390625, + "memory(GiB)": 43.66, + "nll_loss": 0.69140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.875, + "rewards/margins": 14.5, + "rewards/rejected": 6.34375, + "step": 110, + "train_speed(iter/s)": 0.34833 + }, + { + "epoch": 2.3, + "grad_norm": 0.30352973245628134, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": 1.546875, + "logits/rejected": 1.0625, + "logps/chosen": -568.0, + "logps/rejected": -704.0, + "loss": 0.596630859375, + "memory(GiB)": 43.66, + "nll_loss": 0.5390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.0, + "rewards/margins": 15.4375, + "rewards/rejected": 5.5, + "step": 115, + "train_speed(iter/s)": 0.349547 + }, + { + "epoch": 2.4, + "grad_norm": 0.5099680534446042, + "learning_rate": 5.7592521739125726e-05, + "logits/chosen": 2.40625, + "logits/rejected": 0.002349853515625, + "logps/chosen": -368.0, + "logps/rejected": -316.0, + "loss": 0.463055419921875, + "memory(GiB)": 
43.66, + "nll_loss": 0.37109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.625, + "rewards/margins": 13.1875, + "rewards/rejected": 3.484375, + "step": 120, + "train_speed(iter/s)": 0.35126 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -0.96484375, + "eval_logits/rejected": 0.416015625, + "eval_logps/chosen": -6.4375, + "eval_logps/rejected": -280.0, + "eval_loss": 0.49755859375, + "eval_nll_loss": 0.279296875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.1875, + "eval_rewards/margins": 14.1875, + "eval_rewards/rejected": -4.0, + "eval_runtime": 1.3007, + "eval_samples_per_second": 3.075, + "eval_steps_per_second": 0.769, + "step": 120 + }, + { + "epoch": 2.5, + "grad_norm": 0.6759212291558464, + "learning_rate": 5.430276272567485e-05, + "logits/chosen": -1.4140625, + "logits/rejected": 0.82421875, + "logps/chosen": -62.5, + "logps/rejected": -1288.0, + "loss": 0.5360504150390625, + "memory(GiB)": 43.66, + "nll_loss": 0.7265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.75, + "rewards/margins": 19.75, + "rewards/rejected": -7.96875, + "step": 125, + "train_speed(iter/s)": 0.350267 + }, + { + "epoch": 2.6, + "grad_norm": 0.3635118516286732, + "learning_rate": 5.0994109383253506e-05, + "logits/chosen": 2.3125, + "logits/rejected": 0.546875, + "logps/chosen": -348.0, + "logps/rejected": -314.0, + "loss": 0.4655029296875, + "memory(GiB)": 43.66, + "nll_loss": 0.484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.875, + "rewards/margins": 15.3125, + "rewards/rejected": 3.515625, + "step": 130, + "train_speed(iter/s)": 0.35164 + }, + { + "epoch": 2.7, + "grad_norm": 0.4546423377677983, + "learning_rate": 4.768109069909307e-05, + "logits/chosen": -0.375, + "logits/rejected": 0.88671875, + "logps/chosen": -157.0, + "logps/rejected": -624.0, + "loss": 0.44596214294433595, + "memory(GiB)": 43.66, + "nll_loss": 0.24609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.375, + "rewards/margins": 14.9375, + 
"rewards/rejected": -1.5625, + "step": 135, + "train_speed(iter/s)": 0.352504 + }, + { + "epoch": 2.8, + "grad_norm": 0.3766526906078867, + "learning_rate": 4.4378254829551396e-05, + "logits/chosen": 1.484375, + "logits/rejected": 1.1484375, + "logps/chosen": -392.0, + "logps/rejected": -708.0, + "loss": 0.4998992919921875, + "memory(GiB)": 43.66, + "nll_loss": 0.734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.625, + "rewards/margins": 17.0, + "rewards/rejected": 1.5546875, + "step": 140, + "train_speed(iter/s)": 0.353731 + }, + { + "epoch": 2.8, + "eval_logits/chosen": -1.0546875, + "eval_logits/rejected": 0.28515625, + "eval_logps/chosen": -6.65625, + "eval_logps/rejected": -282.0, + "eval_loss": 0.49853515625, + "eval_nll_loss": 0.2890625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.1875, + "eval_rewards/margins": 14.375, + "eval_rewards/rejected": -4.1875, + "eval_runtime": 1.2976, + "eval_samples_per_second": 3.083, + "eval_steps_per_second": 0.771, + "step": 140 + }, + { + "epoch": 2.9, + "grad_norm": 0.7368475635982081, + "learning_rate": 4.11001052161225e-05, + "logits/chosen": 1.484375, + "logits/rejected": 0.7578125, + "logps/chosen": -624.0, + "logps/rejected": -664.0, + "loss": 0.5592437744140625, + "memory(GiB)": 43.66, + "nll_loss": 0.7421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.75, + "rewards/margins": 16.875, + "rewards/rejected": 4.90625, + "step": 145, + "train_speed(iter/s)": 0.350922 + }, + { + "epoch": 3.0, + "grad_norm": 0.526438371292198, + "learning_rate": 3.786103689779861e-05, + "logits/chosen": 2.53125, + "logits/rejected": -0.11962890625, + "logps/chosen": -406.0, + "logps/rejected": -310.0, + "loss": 0.4158843994140625, + "memory(GiB)": 43.66, + "nll_loss": 0.39453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.5, + "rewards/margins": 19.0, + "rewards/rejected": 3.5, + "step": 150, + "train_speed(iter/s)": 0.350374 + }, + { + "epoch": 3.1, + "grad_norm": 0.3832315044394523, + 
"learning_rate": 3.467527329945026e-05, + "logits/chosen": 1.1484375, + "logits/rejected": 1.328125, + "logps/chosen": -600.0, + "logps/rejected": -588.0, + "loss": 0.5336650848388672, + "memory(GiB)": 43.66, + "nll_loss": 0.515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.125, + "rewards/margins": 17.375, + "rewards/rejected": 1.7265625, + "step": 155, + "train_speed(iter/s)": 0.350973 + }, + { + "epoch": 3.2, + "grad_norm": 0.41804293149830996, + "learning_rate": 3.1556803773799614e-05, + "logits/chosen": 2.84375, + "logits/rejected": 0.10693359375, + "logps/chosen": -416.0, + "logps/rejected": -360.0, + "loss": 0.456103515625, + "memory(GiB)": 43.66, + "nll_loss": 0.38671875, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.5, + "rewards/margins": 16.75, + "rewards/rejected": 4.71875, + "step": 160, + "train_speed(iter/s)": 0.350912 + }, + { + "epoch": 3.2, + "eval_logits/chosen": -1.046875, + "eval_logits/rejected": 0.1669921875, + "eval_logps/chosen": -6.46875, + "eval_logps/rejected": -286.0, + "eval_loss": 0.4970703125, + "eval_nll_loss": 0.28125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.1875, + "eval_rewards/margins": 14.75, + "eval_rewards/rejected": -4.59375, + "eval_runtime": 1.2858, + "eval_samples_per_second": 3.111, + "eval_steps_per_second": 0.778, + "step": 160 + }, + { + "epoch": 3.3, + "grad_norm": 0.9977986802164632, + "learning_rate": 2.8519322171253602e-05, + "logits/chosen": -0.173828125, + "logits/rejected": 0.234375, + "logps/chosen": -145.0, + "logps/rejected": -498.0, + "loss": 0.4285240173339844, + "memory(GiB)": 43.66, + "nll_loss": 0.16796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.625, + "rewards/margins": 18.5, + "rewards/rejected": -1.7890625, + "step": 165, + "train_speed(iter/s)": 0.350263 + }, + { + "epoch": 3.4, + "grad_norm": 0.5073480559704879, + "learning_rate": 2.5576166707349385e-05, + "logits/chosen": 3.234375, + "logits/rejected": -0.92578125, + "logps/chosen": -640.0, + 
"logps/rejected": -207.0, + "loss": 0.45649566650390627, + "memory(GiB)": 43.66, + "nll_loss": 0.453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 25.0, + "rewards/margins": 19.125, + "rewards/rejected": 5.875, + "step": 170, + "train_speed(iter/s)": 0.35017 + }, + { + "epoch": 3.5, + "grad_norm": 0.5295452954524565, + "learning_rate": 2.2740261391866637e-05, + "logits/chosen": 0.498046875, + "logits/rejected": 0.392578125, + "logps/chosen": -466.0, + "logps/rejected": -592.0, + "loss": 0.39521121978759766, + "memory(GiB)": 43.66, + "nll_loss": 0.65625, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.75, + "rewards/margins": 14.75, + "rewards/rejected": 7.03125, + "step": 175, + "train_speed(iter/s)": 0.350866 + }, + { + "epoch": 3.6, + "grad_norm": 0.30567037033016853, + "learning_rate": 2.002405927680374e-05, + "logits/chosen": 3.09375, + "logits/rejected": -0.09326171875, + "logps/chosen": -482.0, + "logps/rejected": -262.0, + "loss": 0.500946044921875, + "memory(GiB)": 43.66, + "nll_loss": 0.546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.25, + "rewards/margins": 17.125, + "rewards/rejected": 4.125, + "step": 180, + "train_speed(iter/s)": 0.351966 + }, + { + "epoch": 3.6, + "eval_logits/chosen": -1.1640625, + "eval_logits/rejected": 0.060546875, + "eval_logps/chosen": -7.0, + "eval_logps/rejected": -284.0, + "eval_loss": 0.501953125, + "eval_nll_loss": 0.302734375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 14.5, + "eval_rewards/rejected": -4.40625, + "eval_runtime": 1.2632, + "eval_samples_per_second": 3.167, + "eval_steps_per_second": 0.792, + "step": 180 + }, + { + "epoch": 3.7, + "grad_norm": 0.9188395046874438, + "learning_rate": 1.743948777242814e-05, + "logits/chosen": 3.0, + "logits/rejected": -0.10009765625, + "logps/chosen": -536.0, + "logps/rejected": -229.0, + "loss": 0.4772364616394043, + "memory(GiB)": 43.66, + "nll_loss": 0.609375, + "rewards/accuracies": 1.0, + 
"rewards/chosen": 21.25, + "rewards/margins": 16.875, + "rewards/rejected": 4.40625, + "step": 185, + "train_speed(iter/s)": 0.35126 + }, + { + "epoch": 3.8, + "grad_norm": 0.5288801717382393, + "learning_rate": 1.4997896271528739e-05, + "logits/chosen": 0.9765625, + "logits/rejected": 1.4140625, + "logps/chosen": -330.0, + "logps/rejected": -708.0, + "loss": 0.4629364013671875, + "memory(GiB)": 43.66, + "nll_loss": 0.6875, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.75, + "rewards/margins": 18.5, + "rewards/rejected": 1.34375, + "step": 190, + "train_speed(iter/s)": 0.352271 + }, + { + "epoch": 3.9, + "grad_norm": 0.5916934882939896, + "learning_rate": 1.2710006311864104e-05, + "logits/chosen": -0.31640625, + "logits/rejected": 0.78125, + "logps/chosen": -414.0, + "logps/rejected": -680.0, + "loss": 0.38446922302246095, + "memory(GiB)": 43.66, + "nll_loss": 0.435546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.75, + "rewards/margins": 20.0, + "rewards/rejected": 2.765625, + "step": 195, + "train_speed(iter/s)": 0.352064 + }, + { + "epoch": 4.0, + "grad_norm": 0.6513423080203622, + "learning_rate": 1.0585864495652897e-05, + "logits/chosen": 1.2578125, + "logits/rejected": 0.53125, + "logps/chosen": -414.0, + "logps/rejected": -728.0, + "loss": 0.43031768798828124, + "memory(GiB)": 56.51, + "nll_loss": 0.48046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.875, + "rewards/margins": 18.125, + "rewards/rejected": 3.78125, + "step": 200, + "train_speed(iter/s)": 0.351751 + }, + { + "epoch": 4.0, + "eval_logits/chosen": -1.265625, + "eval_logits/rejected": 0.0634765625, + "eval_logps/chosen": -7.5625, + "eval_logps/rejected": -286.0, + "eval_loss": 0.509765625, + "eval_nll_loss": 0.328125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 14.75, + "eval_rewards/rejected": -4.59375, + "eval_runtime": 1.2468, + "eval_samples_per_second": 3.208, + "eval_steps_per_second": 0.802, + "step": 200 + } + ], + 
"logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 175470621622272.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/training_args.bin b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..475462e40417630252a03e5a9352b5e11f000ed1 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:647dcffabf1a4c913f292b185978f2f179c4819a3df809536d284c803d9d97fe +size 8888 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/zero_to_fp32.py b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-200/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. 
+# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob 
rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return 
zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/README.md b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/README.md new file mode 100644 index 0000000000000000000000000000000000000000..525e0a2a5243ae77491e227cbf74fa8ff0a43579 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-7b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both 
direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/adapter_config.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..454fa4ee7fc180fe11671ee7f6c90d0ba44328a6 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + 
"rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "gate_proj", + "up_proj", + "o_proj", + "v_proj", + "down_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/adapter_model.safetensors b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8369bd45fab4cdd702e3883761ac4cadf2897880 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ee768d44db7e84517472d094577443a14da0a4882749f20483506ed8c811704 +size 40422208 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/additional_config.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/args.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/args.json new file mode 100644 index 0000000000000000000000000000000000000000..4b016b97ef2d4589dae9d4623f25925cb829d241 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/args.json @@ -0,0 +1,371 @@ +{ + "model": 
"/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + 
"do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + 
"fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + 
"push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + 
"galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-7b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-7b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, 
task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, 
per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, 
use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, 
neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c48defa1b0b2f3cc5eca149dcbb7f1fc6f26899 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b1b054caf1c57a37ab0331b783285fb38e4728fc156531ddc5fb067ae0c9021 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..79a7f19a4fceaa39fce1857ced64db3f933937d6 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b56ff0afd72a544a0969299a667ba6cc206fcd7beb2ff2e5417304a8368b7f2 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b130e89be8eacacaa465537b14d20d5dd7f08ac8 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f19547f4ede73e8d4805dee857afba03db04dedb2e94d79e4b3c7f9f151eff23 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c43e4c940506f9548b55941c65aef7d610df40c --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:66998b0c3d8251a62bb2e41be02b6f8e93312d32c4475620cb3a6ed02d4df09a +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5fdde5aaaa53045e3840b749fec53ed50041b45a --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8847fbc8e7342e24b56c9aef8358126e8394feea82064d2f00a0006e7f852fc3 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3db9a39cbf75a3f6bc5e4c794e6b8b06e3b1571b --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6521e980e70d1ffa98d0deaf73ee79b41e97df0f92364081848ac07bf3960fa4 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..448eeb48c2f2e4a1c6f15969fb1926cbd6b4849e --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbaa56ced6f332186042fb7b347e9fde2335bf40c0334328ee7433f43355c189 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..83d8822eb5274856e7e127710cfc5bdb747aac68 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fed794bb98b668f0ab72bfa22a2d1b04603dce2c9e6cbb60fead7bba3066bdbb +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/zero_pp_rank_0_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e7acb657bf491326de8742f99de24adc7917baf9 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8a5af401beefb559e4a04e59d23fec0afa13b80bdbe3bd7f6b80bf3969a7d62 +size 388374 diff --git 
a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/zero_pp_rank_1_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..86b1ff4823958a9d1a70b57ee735549f029c56b0 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bea00c7690d15b788991d9d49dfa7602df48b33bf632d02e3abc25c15b9ca35d +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/zero_pp_rank_2_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d2732f0580aa4a93a6cb7feec18ab73dcefb5ed2 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7f9a7d3cc0eaf263c771043c16a68c30bc4b1382a0c61cf1e366581cb1f8dd0 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/zero_pp_rank_3_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bea9b9dccb43ef649fb76780912fef12fa3631d9 --- /dev/null +++ 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4156811b2575b7b89241945217ed57f72e805f152a03346fc94a46f3d9e4c969 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/zero_pp_rank_4_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c2e31e483d3e2c176739650cc7b4d981aa8f7309 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b85283ff004175a36b351e8792ccb5d59271ed606a9ffd8358a2f8fa74ac657 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/zero_pp_rank_5_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..af74272c2793ebc3911d5e4da5145d0d85879f97 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b4d153f6b6fdb05aecb7775dc68208af722a237b69078a0414bd751fffe3e7a +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/zero_pp_rank_6_mp_rank_00_model_states.pt 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..537bd8791fbe9cf1d00250b04adf9af0a01a5e71 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5fd244222ee08d684b5042212a977bd6979a0011d1c49faa35f1da630f53510 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/zero_pp_rank_7_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..338118ae16f25177e8cd9b9fbdbda0319626e9c2 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/global_step220/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9855f576107ead524852a106bd492dc94ecceb10d882b1902ed9d71f2638ee44 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/latest b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/latest new file mode 100644 index 0000000000000000000000000000000000000000..c9ebe2709e7f014a6431e10a08b9ee83756b9b83 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/latest @@ -0,0 +1 @@ +global_step220 \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/rng_state_0.pth 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..331a542ef30cc221562b6a988bba872aca28732e --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb7c3bc1248de8b4739437317b988d953fd64a5de9736606d74f9c8277f1b485 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/rng_state_1.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d445f1a845bda18b54837a3234302870193ebea4 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8e571d57a85eb2cdabf3f46c86e446bdb7d26aba8b1467b5e4b5bbe29ad42a7 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/rng_state_2.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..3a1a5fda176cefd8a1f05e423f2c82ed9f2333bf --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:489e5542988617525a395c45dc83ec6bf25b473812e139122f0a3f3d92f031d0 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/rng_state_3.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/rng_state_3.pth new file mode 100644 index 
0000000000000000000000000000000000000000..a7495a1bc89c5532615f548b4a177c4b6de82a0a --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cd77682efb711872c5be25e87e87a2726a2e7105422cddd00f04da7be35ca20 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/rng_state_4.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..a0dd539c338038495aec8fdc04c5e6d165086b28 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e44d9e7d535f5fbcd7cfef16ba22d32d5f445aacceba782a05df1f97d47a608a +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/rng_state_5.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..bd7cb309d087786d365a3ca391edef06504b3bb4 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a107290a0d9898930bc6abe369ee246ef7322541985fc2a5320e7775f5ea5c88 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/rng_state_6.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..3c760c81b8bffb4ba6cb4dcda4460911ef5e78df --- /dev/null +++ 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88ab49d56ee4079c2a208376064f825918f070addc8f0c58c5c594265f9e8a78 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/rng_state_7.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..62523a33304462480531f2f10d91dcdd14562719 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d15033d06420b17d80db45c89544170faa67833d5a0d9c30a51a38a1102b073 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/scheduler.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..fc26f1e85f4e8e85881b70bb37705b907a71e2da --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a192b6eaac6b92a2de7d039b2fc8b1f373bff6953e1e6a952189b56167078edd +size 1064 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/trainer_state.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7eb9cb7044bd10caa638bfeea191868fb5fcc7db --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/trainer_state.json @@ -0,0 +1,1030 @@ +{ + "best_metric": 0.49707031, + 
"best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160", + "epoch": 4.4, + "eval_steps": 20, + "global_step": 220, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 9.15866064284304, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": 2.390625, + "logits/rejected": -2.546875, + "logps/chosen": -520.0, + "logps/rejected": -142.0, + "loss": 2.4697265625, + "memory(GiB)": 8.4, + "nll_loss": 0.51171875, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.126463 + }, + { + "epoch": 0.1, + "grad_norm": 26.585759281524314, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": 0.1533203125, + "logits/rejected": -0.8671875, + "logps/chosen": -728.0, + "logps/rejected": -304.0, + "loss": 2.3878173828125, + "memory(GiB)": 12.04, + "nll_loss": 1.1484375, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.125, + "rewards/margins": -0.0250244140625, + "rewards/rejected": 0.150390625, + "step": 5, + "train_speed(iter/s)": 0.276117 + }, + { + "epoch": 0.2, + "grad_norm": 8.664467296577401, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -0.419921875, + "logits/rejected": 1.2890625, + "logps/chosen": -644.0, + "logps/rejected": -572.0, + "loss": 2.18505859375, + "memory(GiB)": 17.87, + "nll_loss": 1.4921875, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.1103515625, + "rewards/margins": 0.08984375, + "rewards/rejected": 0.0201416015625, + "step": 10, + "train_speed(iter/s)": 0.313547 + }, + { + "epoch": 0.3, + "grad_norm": 6.325430646213694, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -0.7578125, + "logits/rejected": 1.8203125, + "logps/chosen": -924.0, + "logps/rejected": -1552.0, + "loss": 1.76845703125, + 
"memory(GiB)": 32.02, + "nll_loss": 1.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.75, + "rewards/margins": 2.390625, + "rewards/rejected": 0.359375, + "step": 15, + "train_speed(iter/s)": 0.318522 + }, + { + "epoch": 0.4, + "grad_norm": 2.351010309283972, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": 0.5859375, + "logits/rejected": -0.4375, + "logps/chosen": -432.0, + "logps/rejected": -324.0, + "loss": 1.172265625, + "memory(GiB)": 32.02, + "nll_loss": 0.6640625, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 4.65625, + "rewards/margins": 1.5546875, + "rewards/rejected": 3.09375, + "step": 20, + "train_speed(iter/s)": 0.331194 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.3125, + "eval_logits/rejected": 0.86328125, + "eval_logps/chosen": -14.3125, + "eval_logps/rejected": -182.0, + "eval_loss": 0.6728515625, + "eval_nll_loss": 0.62109375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.375, + "eval_rewards/margins": 3.5625, + "eval_rewards/rejected": 5.8125, + "eval_runtime": 1.2862, + "eval_samples_per_second": 3.11, + "eval_steps_per_second": 0.778, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.5984307422346672, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": 0.255859375, + "logits/rejected": -0.0390625, + "logps/chosen": -326.0, + "logps/rejected": -420.0, + "loss": 0.8047607421875, + "memory(GiB)": 32.02, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.125, + "rewards/margins": 5.40625, + "rewards/rejected": 3.6875, + "step": 25, + "train_speed(iter/s)": 0.330795 + }, + { + "epoch": 0.6, + "grad_norm": 0.5223734060204478, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -0.21875, + "logits/rejected": 1.546875, + "logps/chosen": -400.0, + "logps/rejected": -656.0, + "loss": 0.6901611328125, + "memory(GiB)": 32.02, + "nll_loss": 0.78125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.5625, + "rewards/margins": 6.78125, + 
"rewards/rejected": 3.765625, + "step": 30, + "train_speed(iter/s)": 0.339407 + }, + { + "epoch": 0.7, + "grad_norm": 1.0839186250481967, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": 1.5625, + "logits/rejected": -1.578125, + "logps/chosen": -592.0, + "logps/rejected": -322.0, + "loss": 0.7055419921875, + "memory(GiB)": 32.02, + "nll_loss": 0.96875, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.75, + "rewards/margins": 9.625, + "rewards/rejected": 3.125, + "step": 35, + "train_speed(iter/s)": 0.346622 + }, + { + "epoch": 0.8, + "grad_norm": 13.025539300060226, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": 0.08740234375, + "logits/rejected": 0.71484375, + "logps/chosen": -225.0, + "logps/rejected": -322.0, + "loss": 0.6363037109375, + "memory(GiB)": 32.02, + "nll_loss": 0.56640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.0625, + "rewards/margins": 7.53125, + "rewards/rejected": 3.546875, + "step": 40, + "train_speed(iter/s)": 0.345794 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -0.8046875, + "eval_logits/rejected": 0.9375, + "eval_logps/chosen": -7.1875, + "eval_logps/rejected": -212.0, + "eval_loss": 0.5283203125, + "eval_nll_loss": 0.3125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 7.3125, + "eval_rewards/rejected": 2.796875, + "eval_runtime": 1.2905, + "eval_samples_per_second": 3.1, + "eval_steps_per_second": 0.775, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.49724108967226127, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": 2.890625, + "logits/rejected": 0.002349853515625, + "logps/chosen": -426.0, + "logps/rejected": -430.0, + "loss": 0.5437744140625, + "memory(GiB)": 32.02, + "nll_loss": 0.470703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.0625, + "rewards/margins": 10.6875, + "rewards/rejected": 3.390625, + "step": 45, + "train_speed(iter/s)": 0.342553 + }, + { + "epoch": 1.0, + "grad_norm": 1.4155859018634052, + 
"learning_rate": 9.410582299213573e-05, + "logits/chosen": 0.2451171875, + "logits/rejected": 1.75, + "logps/chosen": -804.0, + "logps/rejected": -1480.0, + "loss": 0.57803955078125, + "memory(GiB)": 32.02, + "nll_loss": 0.6953125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.25, + "rewards/margins": 9.125, + "rewards/rejected": 7.125, + "step": 50, + "train_speed(iter/s)": 0.346029 + }, + { + "epoch": 1.1, + "grad_norm": 0.5694275670231188, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": 1.8359375, + "logits/rejected": 0.703125, + "logps/chosen": -386.0, + "logps/rejected": -612.0, + "loss": 0.6138671875, + "memory(GiB)": 43.66, + "nll_loss": 0.546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.75, + "rewards/margins": 12.25, + "rewards/rejected": 3.53125, + "step": 55, + "train_speed(iter/s)": 0.347089 + }, + { + "epoch": 1.2, + "grad_norm": 0.3472486708219598, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": 1.65625, + "logits/rejected": 2.15625, + "logps/chosen": -233.0, + "logps/rejected": -520.0, + "loss": 0.5795654296875, + "memory(GiB)": 43.66, + "nll_loss": 0.396484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.375, + "rewards/margins": 12.1875, + "rewards/rejected": 3.140625, + "step": 60, + "train_speed(iter/s)": 0.349928 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -0.23828125, + "eval_logits/rejected": 1.703125, + "eval_logps/chosen": -7.3125, + "eval_logps/rejected": -223.0, + "eval_loss": 0.51806640625, + "eval_nll_loss": 0.31640625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 8.4375, + "eval_rewards/rejected": 1.703125, + "eval_runtime": 1.2918, + "eval_samples_per_second": 3.096, + "eval_steps_per_second": 0.774, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.48830819519223906, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": 0.97265625, + "logits/rejected": 1.9921875, + "logps/chosen": -346.0, + "logps/rejected": -844.0, + 
"loss": 0.6193359375, + "memory(GiB)": 43.66, + "nll_loss": 0.640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5625, + "rewards/margins": 11.875, + "rewards/rejected": 3.703125, + "step": 65, + "train_speed(iter/s)": 0.34619 + }, + { + "epoch": 1.4, + "grad_norm": 0.3461001279478362, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -0.6875, + "logits/rejected": 2.28125, + "logps/chosen": -88.0, + "logps/rejected": -1120.0, + "loss": 0.5681396484375, + "memory(GiB)": 43.66, + "nll_loss": 0.84375, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 11.5625, + "rewards/margins": 9.125, + "rewards/rejected": 2.484375, + "step": 70, + "train_speed(iter/s)": 0.349389 + }, + { + "epoch": 1.5, + "grad_norm": 0.49738864814975586, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": 2.515625, + "logits/rejected": 0.263671875, + "logps/chosen": -462.0, + "logps/rejected": -434.0, + "loss": 0.517205810546875, + "memory(GiB)": 43.66, + "nll_loss": 0.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.625, + "rewards/margins": 16.375, + "rewards/rejected": 2.28125, + "step": 75, + "train_speed(iter/s)": 0.351345 + }, + { + "epoch": 1.6, + "grad_norm": 0.3498527388541882, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": 3.5625, + "logits/rejected": -1.296875, + "logps/chosen": -580.0, + "logps/rejected": -132.0, + "loss": 0.5629638671875, + "memory(GiB)": 43.66, + "nll_loss": 0.703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.5, + "rewards/margins": 15.6875, + "rewards/rejected": 5.84375, + "step": 80, + "train_speed(iter/s)": 0.353922 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -0.68359375, + "eval_logits/rejected": 0.3125, + "eval_logps/chosen": -6.3125, + "eval_logps/rejected": -306.0, + "eval_loss": 0.5009765625, + "eval_nll_loss": 0.2734375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.1875, + "eval_rewards/margins": 16.75, + "eval_rewards/rejected": -6.59375, + "eval_runtime": 
1.3254, + "eval_samples_per_second": 3.018, + "eval_steps_per_second": 0.754, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.6649877164961882, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": 0.72265625, + "logits/rejected": 0.388671875, + "logps/chosen": -326.0, + "logps/rejected": -768.0, + "loss": 0.70977783203125, + "memory(GiB)": 43.66, + "nll_loss": 0.458984375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5, + "rewards/margins": 18.375, + "rewards/rejected": -2.859375, + "step": 85, + "train_speed(iter/s)": 0.351477 + }, + { + "epoch": 1.8, + "grad_norm": 0.5774391096810936, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": 2.15625, + "logits/rejected": 1.515625, + "logps/chosen": -290.0, + "logps/rejected": -520.0, + "loss": 0.5434326171875, + "memory(GiB)": 43.66, + "nll_loss": 0.56640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.75, + "rewards/margins": 14.25, + "rewards/rejected": 0.5390625, + "step": 90, + "train_speed(iter/s)": 0.351018 + }, + { + "epoch": 1.9, + "grad_norm": 0.46335079012099145, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": 1.65625, + "logits/rejected": 1.1640625, + "logps/chosen": -366.0, + "logps/rejected": -620.0, + "loss": 0.60682373046875, + "memory(GiB)": 43.66, + "nll_loss": 0.5, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.75, + "rewards/margins": 15.9375, + "rewards/rejected": 0.7890625, + "step": 95, + "train_speed(iter/s)": 0.351082 + }, + { + "epoch": 2.0, + "grad_norm": 0.44055183589508673, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": 0.59765625, + "logits/rejected": 0.1025390625, + "logps/chosen": -276.0, + "logps/rejected": -360.0, + "loss": 0.51856689453125, + "memory(GiB)": 43.66, + "nll_loss": 0.470703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.6875, + "rewards/margins": 14.0, + "rewards/rejected": 1.65625, + "step": 100, + "train_speed(iter/s)": 0.350869 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -0.63671875, + 
"eval_logits/rejected": 0.66796875, + "eval_logps/chosen": -6.84375, + "eval_logps/rejected": -280.0, + "eval_loss": 0.50390625, + "eval_nll_loss": 0.298828125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 14.125, + "eval_rewards/rejected": -4.0, + "eval_runtime": 1.3254, + "eval_samples_per_second": 3.018, + "eval_steps_per_second": 0.754, + "step": 100 + }, + { + "epoch": 2.1, + "grad_norm": 0.43319830084665456, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": 3.5, + "logits/rejected": 0.185546875, + "logps/chosen": -442.0, + "logps/rejected": -580.0, + "loss": 0.540869140625, + "memory(GiB)": 43.66, + "nll_loss": 0.60546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.375, + "rewards/margins": 16.125, + "rewards/rejected": 4.25, + "step": 105, + "train_speed(iter/s)": 0.34793 + }, + { + "epoch": 2.2, + "grad_norm": 0.5087186205209226, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": 0.41015625, + "logits/rejected": 1.9375, + "logps/chosen": -688.0, + "logps/rejected": -1280.0, + "loss": 0.5629150390625, + "memory(GiB)": 43.66, + "nll_loss": 0.69140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.875, + "rewards/margins": 14.5, + "rewards/rejected": 6.34375, + "step": 110, + "train_speed(iter/s)": 0.34833 + }, + { + "epoch": 2.3, + "grad_norm": 0.30352973245628134, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": 1.546875, + "logits/rejected": 1.0625, + "logps/chosen": -568.0, + "logps/rejected": -704.0, + "loss": 0.596630859375, + "memory(GiB)": 43.66, + "nll_loss": 0.5390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.0, + "rewards/margins": 15.4375, + "rewards/rejected": 5.5, + "step": 115, + "train_speed(iter/s)": 0.349547 + }, + { + "epoch": 2.4, + "grad_norm": 0.5099680534446042, + "learning_rate": 5.7592521739125726e-05, + "logits/chosen": 2.40625, + "logits/rejected": 0.002349853515625, + "logps/chosen": -368.0, + "logps/rejected": -316.0, + "loss": 
0.463055419921875, + "memory(GiB)": 43.66, + "nll_loss": 0.37109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.625, + "rewards/margins": 13.1875, + "rewards/rejected": 3.484375, + "step": 120, + "train_speed(iter/s)": 0.35126 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -0.96484375, + "eval_logits/rejected": 0.416015625, + "eval_logps/chosen": -6.4375, + "eval_logps/rejected": -280.0, + "eval_loss": 0.49755859375, + "eval_nll_loss": 0.279296875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.1875, + "eval_rewards/margins": 14.1875, + "eval_rewards/rejected": -4.0, + "eval_runtime": 1.3007, + "eval_samples_per_second": 3.075, + "eval_steps_per_second": 0.769, + "step": 120 + }, + { + "epoch": 2.5, + "grad_norm": 0.6759212291558464, + "learning_rate": 5.430276272567485e-05, + "logits/chosen": -1.4140625, + "logits/rejected": 0.82421875, + "logps/chosen": -62.5, + "logps/rejected": -1288.0, + "loss": 0.5360504150390625, + "memory(GiB)": 43.66, + "nll_loss": 0.7265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.75, + "rewards/margins": 19.75, + "rewards/rejected": -7.96875, + "step": 125, + "train_speed(iter/s)": 0.350267 + }, + { + "epoch": 2.6, + "grad_norm": 0.3635118516286732, + "learning_rate": 5.0994109383253506e-05, + "logits/chosen": 2.3125, + "logits/rejected": 0.546875, + "logps/chosen": -348.0, + "logps/rejected": -314.0, + "loss": 0.4655029296875, + "memory(GiB)": 43.66, + "nll_loss": 0.484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.875, + "rewards/margins": 15.3125, + "rewards/rejected": 3.515625, + "step": 130, + "train_speed(iter/s)": 0.35164 + }, + { + "epoch": 2.7, + "grad_norm": 0.4546423377677983, + "learning_rate": 4.768109069909307e-05, + "logits/chosen": -0.375, + "logits/rejected": 0.88671875, + "logps/chosen": -157.0, + "logps/rejected": -624.0, + "loss": 0.44596214294433595, + "memory(GiB)": 43.66, + "nll_loss": 0.24609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.375, + 
"rewards/margins": 14.9375, + "rewards/rejected": -1.5625, + "step": 135, + "train_speed(iter/s)": 0.352504 + }, + { + "epoch": 2.8, + "grad_norm": 0.3766526906078867, + "learning_rate": 4.4378254829551396e-05, + "logits/chosen": 1.484375, + "logits/rejected": 1.1484375, + "logps/chosen": -392.0, + "logps/rejected": -708.0, + "loss": 0.4998992919921875, + "memory(GiB)": 43.66, + "nll_loss": 0.734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.625, + "rewards/margins": 17.0, + "rewards/rejected": 1.5546875, + "step": 140, + "train_speed(iter/s)": 0.353731 + }, + { + "epoch": 2.8, + "eval_logits/chosen": -1.0546875, + "eval_logits/rejected": 0.28515625, + "eval_logps/chosen": -6.65625, + "eval_logps/rejected": -282.0, + "eval_loss": 0.49853515625, + "eval_nll_loss": 0.2890625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.1875, + "eval_rewards/margins": 14.375, + "eval_rewards/rejected": -4.1875, + "eval_runtime": 1.2976, + "eval_samples_per_second": 3.083, + "eval_steps_per_second": 0.771, + "step": 140 + }, + { + "epoch": 2.9, + "grad_norm": 0.7368475635982081, + "learning_rate": 4.11001052161225e-05, + "logits/chosen": 1.484375, + "logits/rejected": 0.7578125, + "logps/chosen": -624.0, + "logps/rejected": -664.0, + "loss": 0.5592437744140625, + "memory(GiB)": 43.66, + "nll_loss": 0.7421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.75, + "rewards/margins": 16.875, + "rewards/rejected": 4.90625, + "step": 145, + "train_speed(iter/s)": 0.350922 + }, + { + "epoch": 3.0, + "grad_norm": 0.526438371292198, + "learning_rate": 3.786103689779861e-05, + "logits/chosen": 2.53125, + "logits/rejected": -0.11962890625, + "logps/chosen": -406.0, + "logps/rejected": -310.0, + "loss": 0.4158843994140625, + "memory(GiB)": 43.66, + "nll_loss": 0.39453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.5, + "rewards/margins": 19.0, + "rewards/rejected": 3.5, + "step": 150, + "train_speed(iter/s)": 0.350374 + }, + { + "epoch": 3.1, + 
"grad_norm": 0.3832315044394523, + "learning_rate": 3.467527329945026e-05, + "logits/chosen": 1.1484375, + "logits/rejected": 1.328125, + "logps/chosen": -600.0, + "logps/rejected": -588.0, + "loss": 0.5336650848388672, + "memory(GiB)": 43.66, + "nll_loss": 0.515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.125, + "rewards/margins": 17.375, + "rewards/rejected": 1.7265625, + "step": 155, + "train_speed(iter/s)": 0.350973 + }, + { + "epoch": 3.2, + "grad_norm": 0.41804293149830996, + "learning_rate": 3.1556803773799614e-05, + "logits/chosen": 2.84375, + "logits/rejected": 0.10693359375, + "logps/chosen": -416.0, + "logps/rejected": -360.0, + "loss": 0.456103515625, + "memory(GiB)": 43.66, + "nll_loss": 0.38671875, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.5, + "rewards/margins": 16.75, + "rewards/rejected": 4.71875, + "step": 160, + "train_speed(iter/s)": 0.350912 + }, + { + "epoch": 3.2, + "eval_logits/chosen": -1.046875, + "eval_logits/rejected": 0.1669921875, + "eval_logps/chosen": -6.46875, + "eval_logps/rejected": -286.0, + "eval_loss": 0.4970703125, + "eval_nll_loss": 0.28125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.1875, + "eval_rewards/margins": 14.75, + "eval_rewards/rejected": -4.59375, + "eval_runtime": 1.2858, + "eval_samples_per_second": 3.111, + "eval_steps_per_second": 0.778, + "step": 160 + }, + { + "epoch": 3.3, + "grad_norm": 0.9977986802164632, + "learning_rate": 2.8519322171253602e-05, + "logits/chosen": -0.173828125, + "logits/rejected": 0.234375, + "logps/chosen": -145.0, + "logps/rejected": -498.0, + "loss": 0.4285240173339844, + "memory(GiB)": 43.66, + "nll_loss": 0.16796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.625, + "rewards/margins": 18.5, + "rewards/rejected": -1.7890625, + "step": 165, + "train_speed(iter/s)": 0.350263 + }, + { + "epoch": 3.4, + "grad_norm": 0.5073480559704879, + "learning_rate": 2.5576166707349385e-05, + "logits/chosen": 3.234375, + "logits/rejected": 
-0.92578125, + "logps/chosen": -640.0, + "logps/rejected": -207.0, + "loss": 0.45649566650390627, + "memory(GiB)": 43.66, + "nll_loss": 0.453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 25.0, + "rewards/margins": 19.125, + "rewards/rejected": 5.875, + "step": 170, + "train_speed(iter/s)": 0.35017 + }, + { + "epoch": 3.5, + "grad_norm": 0.5295452954524565, + "learning_rate": 2.2740261391866637e-05, + "logits/chosen": 0.498046875, + "logits/rejected": 0.392578125, + "logps/chosen": -466.0, + "logps/rejected": -592.0, + "loss": 0.39521121978759766, + "memory(GiB)": 43.66, + "nll_loss": 0.65625, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.75, + "rewards/margins": 14.75, + "rewards/rejected": 7.03125, + "step": 175, + "train_speed(iter/s)": 0.350866 + }, + { + "epoch": 3.6, + "grad_norm": 0.30567037033016853, + "learning_rate": 2.002405927680374e-05, + "logits/chosen": 3.09375, + "logits/rejected": -0.09326171875, + "logps/chosen": -482.0, + "logps/rejected": -262.0, + "loss": 0.500946044921875, + "memory(GiB)": 43.66, + "nll_loss": 0.546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.25, + "rewards/margins": 17.125, + "rewards/rejected": 4.125, + "step": 180, + "train_speed(iter/s)": 0.351966 + }, + { + "epoch": 3.6, + "eval_logits/chosen": -1.1640625, + "eval_logits/rejected": 0.060546875, + "eval_logps/chosen": -7.0, + "eval_logps/rejected": -284.0, + "eval_loss": 0.501953125, + "eval_nll_loss": 0.302734375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 14.5, + "eval_rewards/rejected": -4.40625, + "eval_runtime": 1.2632, + "eval_samples_per_second": 3.167, + "eval_steps_per_second": 0.792, + "step": 180 + }, + { + "epoch": 3.7, + "grad_norm": 0.9188395046874438, + "learning_rate": 1.743948777242814e-05, + "logits/chosen": 3.0, + "logits/rejected": -0.10009765625, + "logps/chosen": -536.0, + "logps/rejected": -229.0, + "loss": 0.4772364616394043, + "memory(GiB)": 43.66, + "nll_loss": 0.609375, + 
"rewards/accuracies": 1.0, + "rewards/chosen": 21.25, + "rewards/margins": 16.875, + "rewards/rejected": 4.40625, + "step": 185, + "train_speed(iter/s)": 0.35126 + }, + { + "epoch": 3.8, + "grad_norm": 0.5288801717382393, + "learning_rate": 1.4997896271528739e-05, + "logits/chosen": 0.9765625, + "logits/rejected": 1.4140625, + "logps/chosen": -330.0, + "logps/rejected": -708.0, + "loss": 0.4629364013671875, + "memory(GiB)": 43.66, + "nll_loss": 0.6875, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.75, + "rewards/margins": 18.5, + "rewards/rejected": 1.34375, + "step": 190, + "train_speed(iter/s)": 0.352271 + }, + { + "epoch": 3.9, + "grad_norm": 0.5916934882939896, + "learning_rate": 1.2710006311864104e-05, + "logits/chosen": -0.31640625, + "logits/rejected": 0.78125, + "logps/chosen": -414.0, + "logps/rejected": -680.0, + "loss": 0.38446922302246095, + "memory(GiB)": 43.66, + "nll_loss": 0.435546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.75, + "rewards/margins": 20.0, + "rewards/rejected": 2.765625, + "step": 195, + "train_speed(iter/s)": 0.352064 + }, + { + "epoch": 4.0, + "grad_norm": 0.6513423080203622, + "learning_rate": 1.0585864495652897e-05, + "logits/chosen": 1.2578125, + "logits/rejected": 0.53125, + "logps/chosen": -414.0, + "logps/rejected": -728.0, + "loss": 0.43031768798828124, + "memory(GiB)": 56.51, + "nll_loss": 0.48046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.875, + "rewards/margins": 18.125, + "rewards/rejected": 3.78125, + "step": 200, + "train_speed(iter/s)": 0.351751 + }, + { + "epoch": 4.0, + "eval_logits/chosen": -1.265625, + "eval_logits/rejected": 0.0634765625, + "eval_logps/chosen": -7.5625, + "eval_logps/rejected": -286.0, + "eval_loss": 0.509765625, + "eval_nll_loss": 0.328125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 14.75, + "eval_rewards/rejected": -4.59375, + "eval_runtime": 1.2468, + "eval_samples_per_second": 3.208, + "eval_steps_per_second": 
0.802, + "step": 200 + }, + { + "epoch": 4.1, + "grad_norm": 0.46757622724638764, + "learning_rate": 8.634798372847148e-06, + "logits/chosen": 0.8984375, + "logits/rejected": 1.0546875, + "logps/chosen": -428.0, + "logps/rejected": -704.0, + "loss": 0.4748867034912109, + "memory(GiB)": 56.51, + "nll_loss": 0.4375, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.5, + "rewards/margins": 16.125, + "rewards/rejected": 5.40625, + "step": 205, + "train_speed(iter/s)": 0.35013 + }, + { + "epoch": 4.2, + "grad_norm": 0.5521912104468201, + "learning_rate": 6.865375481914016e-06, + "logits/chosen": 2.734375, + "logits/rejected": -0.33984375, + "logps/chosen": -274.0, + "logps/rejected": -230.0, + "loss": 0.42456893920898436, + "memory(GiB)": 56.51, + "nll_loss": 0.34375, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.125, + "rewards/margins": 16.375, + "rewards/rejected": 3.71875, + "step": 210, + "train_speed(iter/s)": 0.350785 + }, + { + "epoch": 4.3, + "grad_norm": 0.6123257585261611, + "learning_rate": 5.285365727986707e-06, + "logits/chosen": -1.28125, + "logits/rejected": 1.8359375, + "logps/chosen": -211.0, + "logps/rejected": -1216.0, + "loss": 0.42719383239746095, + "memory(GiB)": 56.51, + "nll_loss": 0.326171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.0, + "rewards/margins": 18.25, + "rewards/rejected": 0.8046875, + "step": 215, + "train_speed(iter/s)": 0.35093 + }, + { + "epoch": 4.4, + "grad_norm": 0.3529237641617113, + "learning_rate": 3.901707263589671e-06, + "logits/chosen": -1.171875, + "logits/rejected": 1.640625, + "logps/chosen": -135.0, + "logps/rejected": -984.0, + "loss": 0.3799613952636719, + "memory(GiB)": 56.51, + "nll_loss": 0.236328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.25, + "rewards/margins": 18.375, + "rewards/rejected": -2.078125, + "step": 220, + "train_speed(iter/s)": 0.350899 + }, + { + "epoch": 4.4, + "eval_logits/chosen": -1.3125, + "eval_logits/rejected": 0.03662109375, + "eval_logps/chosen": 
-7.9375, + "eval_logps/rejected": -290.0, + "eval_loss": 0.51318359375, + "eval_nll_loss": 0.345703125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.0625, + "eval_rewards/margins": 15.0625, + "eval_rewards/rejected": -5.0, + "eval_runtime": 1.3289, + "eval_samples_per_second": 3.01, + "eval_steps_per_second": 0.753, + "step": 220 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 193792278134784.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/training_args.bin b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..475462e40417630252a03e5a9352b5e11f000ed1 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:647dcffabf1a4c913f292b185978f2f179c4819a3df809536d284c803d9d97fe +size 8888 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/zero_to_fp32.py b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-220/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = 
os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = 
state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/README.md b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/README.md new file mode 100644 index 0000000000000000000000000000000000000000..525e0a2a5243ae77491e227cbf74fa8ff0a43579 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-7b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both 
direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/adapter_config.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..454fa4ee7fc180fe11671ee7f6c90d0ba44328a6 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + 
"rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "gate_proj", + "up_proj", + "o_proj", + "v_proj", + "down_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/adapter_model.safetensors b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a283fb181691f4aa4fbc6857c19f2cc6927c6157 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4e665a46cff0e2c5aff02a341b72084c5864e713130779efae056b90a49db8b +size 40422208 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/additional_config.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/args.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/args.json new file mode 100644 index 0000000000000000000000000000000000000000..4b016b97ef2d4589dae9d4623f25925cb829d241 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/args.json @@ -0,0 +1,371 @@ +{ + "model": 
"/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + 
"do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + 
"fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + 
"push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + 
"galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-7b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-7b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, 
task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, 
per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, 
use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, 
neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dbc605e1b3a693f504bcaa34227962c16c0673f9 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77e6ecdcf3b1d9e97dcbcb11ec66ce07c8d50212f3a92dfdb8b29b4302d9ab6f +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dd7e1f6833c5d8a57d9647025ce68b4dc603a216 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bffff17b95c0cb2ab003bd98d2b8019d51a95277033e287acdc265fc7528360 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..319573d90a6862856ed63112c851fbf94dd78454 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:478d63fe59e146c64a364727e6e12625ecb752129da72d3ba65ef58cb1ebf6d5 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..75a1e3f45d41e27ff934f5457247f115597ff919 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:77e1dfe4eef87dab0cba4567a9f04603d40b2cbb224801b429dd5b7fd38436e7 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f462392884f4c832f4d5cc08f2abe4e3e55bbff --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe1ad55c65965c59fe6b81c8dd0d06a68ddeba5d91b19a39bcdd27de1e4ad72f +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..103a1bc21e1e9f69d55196911ef2cbd31565b343 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f49a61d0443592a36e48e80016827a06f75d94ac070222f7badde884e5a5ae83 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..97baa10fc6db201623bcfa4d7327538ce15a315d --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:441fb5e15319810eacf19337733c8a3b6ec6adaee65f7135b7a0432d9eec0769 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0d57b34d2dc5f2a5591aabfcc4e1fffb07238212 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:247c4bdb6bc6d6a710b102f7f04f3a06228a0565f8009eea58689fb4a4097422 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/zero_pp_rank_0_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..468cb91ab578eb4692020459320d35953c1a825e --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:689e67c9b520c73f61d1c09c4dc3c0cc16cdd6d82b4aec21de1164fd7162e3c9 +size 388374 diff --git 
a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/zero_pp_rank_1_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..74200e67e5cf08b9ab0d144bfd1a29bd3a7519e5 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f78ce9eda781f635f013d47134b8be6dc57599dafd5d9a1ebbc3f31d9652994 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/zero_pp_rank_2_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..23c7f9a49789ec93975e710f25dfccdfc0504a9a --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4a35fbe62bdcb76f73248f8da51dfc3d749b86aea6891582321dcff04f372a7 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/zero_pp_rank_3_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d36ed7d9063edd23a7d73cddb5be8bcc501a3c91 --- /dev/null +++ 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52cbddc57bbc171f1c371eaa31be85d936ed2b3170268644845949fbbee282f6 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/zero_pp_rank_4_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e677f313c46495aea86948674d2bacf68bbab16c --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d71a712d5e691b10668d4cdd7b58cb1da045cfa4f3990317f411086c016d573 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/zero_pp_rank_5_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bcd27a070312168bccb8c6d60993367e464bcc6a --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22449c2965eb9d73c0485168a095198783a90d99d7afadc4a5025e3811327b52 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/zero_pp_rank_6_mp_rank_00_model_states.pt 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..23e428a59a582b0a5173fbb30cc8c855813aaa39 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55d6c074be0c279eee265f9286da0e81247558d8630fafdd7a605f7093905625 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/zero_pp_rank_7_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5ce1b1c4819172842679c3e9b91e5311eff56409 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/global_step240/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce4377fb0e47f8d6f5b86d5ce55bd211234ce548571f83b3419d3b4a4d65d82b +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/latest b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/latest new file mode 100644 index 0000000000000000000000000000000000000000..161e63cf7292b2184098d115f0621d2ed09e86c5 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/latest @@ -0,0 +1 @@ +global_step240 \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/rng_state_0.pth 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..d3a6ea45dd4e59b9683f66476f460fa0c77a9d66 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a0c9979566a5d89cb3c766336548670ec6f2291deba1b7ab1764c12d3187b24 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/rng_state_1.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..42e6b0d6985c9b3f0cec701759e0b3d671c77abd --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03e36a570d6158fc25d1cf5d9f8f450fc64c5a7683330277f89ff76d5f2fc6cd +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/rng_state_2.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..376994a32199299a2a48b62753947cdb1f7ad72a --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4f619cbef4b74f1680d667c8788285a602392e63bdf3760ef3a59ec8864d483 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/rng_state_3.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/rng_state_3.pth new file mode 100644 index 
0000000000000000000000000000000000000000..6f1edb2dfec55e5cbead7ae3d14351c3650c4f77 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fc037fba93ace1bf7ce01b1a5f7d785698d47b4cc2cedf2300bbf7a41ebf05c +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/rng_state_4.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..016d34db4ec6597c207021d026234c9692c3f3ad --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ab728c2461d6d1c64f04d7cbfdfcbfa7bd7ad0ef6e19d52458501ee81b27128 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/rng_state_5.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..5d7824c2bd9e8b1cec7f0d84d673017b0da62e43 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27530e653ebf5997ae3159cdcde264607e6a6f86b7e3c7a1b3a1e8301cd43d03 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/rng_state_6.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..8f41ee261ad98d2d0eb8f09167a5b32604513b56 --- /dev/null +++ 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1fddaeb1257697bd7c0101abf1ab23f2925d0d9165cd8bddfbd22f8444db2b7 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/rng_state_7.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..d8816834cc1c0e822e11a8df138fa41557f3a0fb --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:942af3734a320fe12a3205a47ca1cdc7d1f0996bfde86c020a35545ccd2fd418 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/scheduler.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ce5faf9896aeadd65d47acddb4b510a6fc3c65f --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3a46b33bfe1e26ebea81904070b93f8e7376ae49add370042b1998521eed8ba +size 1064 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/trainer_state.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..dcd4f82678680b5d8af3a23f2427fe2e7b6f6770 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/trainer_state.json @@ -0,0 +1,1119 @@ +{ + "best_metric": 0.49707031, + 
"best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160", + "epoch": 4.8, + "eval_steps": 20, + "global_step": 240, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 9.15866064284304, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": 2.390625, + "logits/rejected": -2.546875, + "logps/chosen": -520.0, + "logps/rejected": -142.0, + "loss": 2.4697265625, + "memory(GiB)": 8.4, + "nll_loss": 0.51171875, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.126463 + }, + { + "epoch": 0.1, + "grad_norm": 26.585759281524314, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": 0.1533203125, + "logits/rejected": -0.8671875, + "logps/chosen": -728.0, + "logps/rejected": -304.0, + "loss": 2.3878173828125, + "memory(GiB)": 12.04, + "nll_loss": 1.1484375, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.125, + "rewards/margins": -0.0250244140625, + "rewards/rejected": 0.150390625, + "step": 5, + "train_speed(iter/s)": 0.276117 + }, + { + "epoch": 0.2, + "grad_norm": 8.664467296577401, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -0.419921875, + "logits/rejected": 1.2890625, + "logps/chosen": -644.0, + "logps/rejected": -572.0, + "loss": 2.18505859375, + "memory(GiB)": 17.87, + "nll_loss": 1.4921875, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.1103515625, + "rewards/margins": 0.08984375, + "rewards/rejected": 0.0201416015625, + "step": 10, + "train_speed(iter/s)": 0.313547 + }, + { + "epoch": 0.3, + "grad_norm": 6.325430646213694, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -0.7578125, + "logits/rejected": 1.8203125, + "logps/chosen": -924.0, + "logps/rejected": -1552.0, + "loss": 1.76845703125, + 
"memory(GiB)": 32.02, + "nll_loss": 1.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.75, + "rewards/margins": 2.390625, + "rewards/rejected": 0.359375, + "step": 15, + "train_speed(iter/s)": 0.318522 + }, + { + "epoch": 0.4, + "grad_norm": 2.351010309283972, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": 0.5859375, + "logits/rejected": -0.4375, + "logps/chosen": -432.0, + "logps/rejected": -324.0, + "loss": 1.172265625, + "memory(GiB)": 32.02, + "nll_loss": 0.6640625, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 4.65625, + "rewards/margins": 1.5546875, + "rewards/rejected": 3.09375, + "step": 20, + "train_speed(iter/s)": 0.331194 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.3125, + "eval_logits/rejected": 0.86328125, + "eval_logps/chosen": -14.3125, + "eval_logps/rejected": -182.0, + "eval_loss": 0.6728515625, + "eval_nll_loss": 0.62109375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.375, + "eval_rewards/margins": 3.5625, + "eval_rewards/rejected": 5.8125, + "eval_runtime": 1.2862, + "eval_samples_per_second": 3.11, + "eval_steps_per_second": 0.778, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.5984307422346672, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": 0.255859375, + "logits/rejected": -0.0390625, + "logps/chosen": -326.0, + "logps/rejected": -420.0, + "loss": 0.8047607421875, + "memory(GiB)": 32.02, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.125, + "rewards/margins": 5.40625, + "rewards/rejected": 3.6875, + "step": 25, + "train_speed(iter/s)": 0.330795 + }, + { + "epoch": 0.6, + "grad_norm": 0.5223734060204478, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -0.21875, + "logits/rejected": 1.546875, + "logps/chosen": -400.0, + "logps/rejected": -656.0, + "loss": 0.6901611328125, + "memory(GiB)": 32.02, + "nll_loss": 0.78125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.5625, + "rewards/margins": 6.78125, + 
"rewards/rejected": 3.765625, + "step": 30, + "train_speed(iter/s)": 0.339407 + }, + { + "epoch": 0.7, + "grad_norm": 1.0839186250481967, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": 1.5625, + "logits/rejected": -1.578125, + "logps/chosen": -592.0, + "logps/rejected": -322.0, + "loss": 0.7055419921875, + "memory(GiB)": 32.02, + "nll_loss": 0.96875, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.75, + "rewards/margins": 9.625, + "rewards/rejected": 3.125, + "step": 35, + "train_speed(iter/s)": 0.346622 + }, + { + "epoch": 0.8, + "grad_norm": 13.025539300060226, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": 0.08740234375, + "logits/rejected": 0.71484375, + "logps/chosen": -225.0, + "logps/rejected": -322.0, + "loss": 0.6363037109375, + "memory(GiB)": 32.02, + "nll_loss": 0.56640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.0625, + "rewards/margins": 7.53125, + "rewards/rejected": 3.546875, + "step": 40, + "train_speed(iter/s)": 0.345794 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -0.8046875, + "eval_logits/rejected": 0.9375, + "eval_logps/chosen": -7.1875, + "eval_logps/rejected": -212.0, + "eval_loss": 0.5283203125, + "eval_nll_loss": 0.3125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 7.3125, + "eval_rewards/rejected": 2.796875, + "eval_runtime": 1.2905, + "eval_samples_per_second": 3.1, + "eval_steps_per_second": 0.775, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.49724108967226127, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": 2.890625, + "logits/rejected": 0.002349853515625, + "logps/chosen": -426.0, + "logps/rejected": -430.0, + "loss": 0.5437744140625, + "memory(GiB)": 32.02, + "nll_loss": 0.470703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.0625, + "rewards/margins": 10.6875, + "rewards/rejected": 3.390625, + "step": 45, + "train_speed(iter/s)": 0.342553 + }, + { + "epoch": 1.0, + "grad_norm": 1.4155859018634052, + 
"learning_rate": 9.410582299213573e-05, + "logits/chosen": 0.2451171875, + "logits/rejected": 1.75, + "logps/chosen": -804.0, + "logps/rejected": -1480.0, + "loss": 0.57803955078125, + "memory(GiB)": 32.02, + "nll_loss": 0.6953125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.25, + "rewards/margins": 9.125, + "rewards/rejected": 7.125, + "step": 50, + "train_speed(iter/s)": 0.346029 + }, + { + "epoch": 1.1, + "grad_norm": 0.5694275670231188, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": 1.8359375, + "logits/rejected": 0.703125, + "logps/chosen": -386.0, + "logps/rejected": -612.0, + "loss": 0.6138671875, + "memory(GiB)": 43.66, + "nll_loss": 0.546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.75, + "rewards/margins": 12.25, + "rewards/rejected": 3.53125, + "step": 55, + "train_speed(iter/s)": 0.347089 + }, + { + "epoch": 1.2, + "grad_norm": 0.3472486708219598, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": 1.65625, + "logits/rejected": 2.15625, + "logps/chosen": -233.0, + "logps/rejected": -520.0, + "loss": 0.5795654296875, + "memory(GiB)": 43.66, + "nll_loss": 0.396484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.375, + "rewards/margins": 12.1875, + "rewards/rejected": 3.140625, + "step": 60, + "train_speed(iter/s)": 0.349928 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -0.23828125, + "eval_logits/rejected": 1.703125, + "eval_logps/chosen": -7.3125, + "eval_logps/rejected": -223.0, + "eval_loss": 0.51806640625, + "eval_nll_loss": 0.31640625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 8.4375, + "eval_rewards/rejected": 1.703125, + "eval_runtime": 1.2918, + "eval_samples_per_second": 3.096, + "eval_steps_per_second": 0.774, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.48830819519223906, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": 0.97265625, + "logits/rejected": 1.9921875, + "logps/chosen": -346.0, + "logps/rejected": -844.0, + 
"loss": 0.6193359375, + "memory(GiB)": 43.66, + "nll_loss": 0.640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5625, + "rewards/margins": 11.875, + "rewards/rejected": 3.703125, + "step": 65, + "train_speed(iter/s)": 0.34619 + }, + { + "epoch": 1.4, + "grad_norm": 0.3461001279478362, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -0.6875, + "logits/rejected": 2.28125, + "logps/chosen": -88.0, + "logps/rejected": -1120.0, + "loss": 0.5681396484375, + "memory(GiB)": 43.66, + "nll_loss": 0.84375, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 11.5625, + "rewards/margins": 9.125, + "rewards/rejected": 2.484375, + "step": 70, + "train_speed(iter/s)": 0.349389 + }, + { + "epoch": 1.5, + "grad_norm": 0.49738864814975586, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": 2.515625, + "logits/rejected": 0.263671875, + "logps/chosen": -462.0, + "logps/rejected": -434.0, + "loss": 0.517205810546875, + "memory(GiB)": 43.66, + "nll_loss": 0.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.625, + "rewards/margins": 16.375, + "rewards/rejected": 2.28125, + "step": 75, + "train_speed(iter/s)": 0.351345 + }, + { + "epoch": 1.6, + "grad_norm": 0.3498527388541882, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": 3.5625, + "logits/rejected": -1.296875, + "logps/chosen": -580.0, + "logps/rejected": -132.0, + "loss": 0.5629638671875, + "memory(GiB)": 43.66, + "nll_loss": 0.703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.5, + "rewards/margins": 15.6875, + "rewards/rejected": 5.84375, + "step": 80, + "train_speed(iter/s)": 0.353922 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -0.68359375, + "eval_logits/rejected": 0.3125, + "eval_logps/chosen": -6.3125, + "eval_logps/rejected": -306.0, + "eval_loss": 0.5009765625, + "eval_nll_loss": 0.2734375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.1875, + "eval_rewards/margins": 16.75, + "eval_rewards/rejected": -6.59375, + "eval_runtime": 
1.3254, + "eval_samples_per_second": 3.018, + "eval_steps_per_second": 0.754, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.6649877164961882, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": 0.72265625, + "logits/rejected": 0.388671875, + "logps/chosen": -326.0, + "logps/rejected": -768.0, + "loss": 0.70977783203125, + "memory(GiB)": 43.66, + "nll_loss": 0.458984375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5, + "rewards/margins": 18.375, + "rewards/rejected": -2.859375, + "step": 85, + "train_speed(iter/s)": 0.351477 + }, + { + "epoch": 1.8, + "grad_norm": 0.5774391096810936, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": 2.15625, + "logits/rejected": 1.515625, + "logps/chosen": -290.0, + "logps/rejected": -520.0, + "loss": 0.5434326171875, + "memory(GiB)": 43.66, + "nll_loss": 0.56640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.75, + "rewards/margins": 14.25, + "rewards/rejected": 0.5390625, + "step": 90, + "train_speed(iter/s)": 0.351018 + }, + { + "epoch": 1.9, + "grad_norm": 0.46335079012099145, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": 1.65625, + "logits/rejected": 1.1640625, + "logps/chosen": -366.0, + "logps/rejected": -620.0, + "loss": 0.60682373046875, + "memory(GiB)": 43.66, + "nll_loss": 0.5, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.75, + "rewards/margins": 15.9375, + "rewards/rejected": 0.7890625, + "step": 95, + "train_speed(iter/s)": 0.351082 + }, + { + "epoch": 2.0, + "grad_norm": 0.44055183589508673, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": 0.59765625, + "logits/rejected": 0.1025390625, + "logps/chosen": -276.0, + "logps/rejected": -360.0, + "loss": 0.51856689453125, + "memory(GiB)": 43.66, + "nll_loss": 0.470703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.6875, + "rewards/margins": 14.0, + "rewards/rejected": 1.65625, + "step": 100, + "train_speed(iter/s)": 0.350869 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -0.63671875, + 
"eval_logits/rejected": 0.66796875, + "eval_logps/chosen": -6.84375, + "eval_logps/rejected": -280.0, + "eval_loss": 0.50390625, + "eval_nll_loss": 0.298828125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 14.125, + "eval_rewards/rejected": -4.0, + "eval_runtime": 1.3254, + "eval_samples_per_second": 3.018, + "eval_steps_per_second": 0.754, + "step": 100 + }, + { + "epoch": 2.1, + "grad_norm": 0.43319830084665456, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": 3.5, + "logits/rejected": 0.185546875, + "logps/chosen": -442.0, + "logps/rejected": -580.0, + "loss": 0.540869140625, + "memory(GiB)": 43.66, + "nll_loss": 0.60546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.375, + "rewards/margins": 16.125, + "rewards/rejected": 4.25, + "step": 105, + "train_speed(iter/s)": 0.34793 + }, + { + "epoch": 2.2, + "grad_norm": 0.5087186205209226, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": 0.41015625, + "logits/rejected": 1.9375, + "logps/chosen": -688.0, + "logps/rejected": -1280.0, + "loss": 0.5629150390625, + "memory(GiB)": 43.66, + "nll_loss": 0.69140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.875, + "rewards/margins": 14.5, + "rewards/rejected": 6.34375, + "step": 110, + "train_speed(iter/s)": 0.34833 + }, + { + "epoch": 2.3, + "grad_norm": 0.30352973245628134, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": 1.546875, + "logits/rejected": 1.0625, + "logps/chosen": -568.0, + "logps/rejected": -704.0, + "loss": 0.596630859375, + "memory(GiB)": 43.66, + "nll_loss": 0.5390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.0, + "rewards/margins": 15.4375, + "rewards/rejected": 5.5, + "step": 115, + "train_speed(iter/s)": 0.349547 + }, + { + "epoch": 2.4, + "grad_norm": 0.5099680534446042, + "learning_rate": 5.7592521739125726e-05, + "logits/chosen": 2.40625, + "logits/rejected": 0.002349853515625, + "logps/chosen": -368.0, + "logps/rejected": -316.0, + "loss": 
0.463055419921875, + "memory(GiB)": 43.66, + "nll_loss": 0.37109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.625, + "rewards/margins": 13.1875, + "rewards/rejected": 3.484375, + "step": 120, + "train_speed(iter/s)": 0.35126 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -0.96484375, + "eval_logits/rejected": 0.416015625, + "eval_logps/chosen": -6.4375, + "eval_logps/rejected": -280.0, + "eval_loss": 0.49755859375, + "eval_nll_loss": 0.279296875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.1875, + "eval_rewards/margins": 14.1875, + "eval_rewards/rejected": -4.0, + "eval_runtime": 1.3007, + "eval_samples_per_second": 3.075, + "eval_steps_per_second": 0.769, + "step": 120 + }, + { + "epoch": 2.5, + "grad_norm": 0.6759212291558464, + "learning_rate": 5.430276272567485e-05, + "logits/chosen": -1.4140625, + "logits/rejected": 0.82421875, + "logps/chosen": -62.5, + "logps/rejected": -1288.0, + "loss": 0.5360504150390625, + "memory(GiB)": 43.66, + "nll_loss": 0.7265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.75, + "rewards/margins": 19.75, + "rewards/rejected": -7.96875, + "step": 125, + "train_speed(iter/s)": 0.350267 + }, + { + "epoch": 2.6, + "grad_norm": 0.3635118516286732, + "learning_rate": 5.0994109383253506e-05, + "logits/chosen": 2.3125, + "logits/rejected": 0.546875, + "logps/chosen": -348.0, + "logps/rejected": -314.0, + "loss": 0.4655029296875, + "memory(GiB)": 43.66, + "nll_loss": 0.484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.875, + "rewards/margins": 15.3125, + "rewards/rejected": 3.515625, + "step": 130, + "train_speed(iter/s)": 0.35164 + }, + { + "epoch": 2.7, + "grad_norm": 0.4546423377677983, + "learning_rate": 4.768109069909307e-05, + "logits/chosen": -0.375, + "logits/rejected": 0.88671875, + "logps/chosen": -157.0, + "logps/rejected": -624.0, + "loss": 0.44596214294433595, + "memory(GiB)": 43.66, + "nll_loss": 0.24609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.375, + 
"rewards/margins": 14.9375, + "rewards/rejected": -1.5625, + "step": 135, + "train_speed(iter/s)": 0.352504 + }, + { + "epoch": 2.8, + "grad_norm": 0.3766526906078867, + "learning_rate": 4.4378254829551396e-05, + "logits/chosen": 1.484375, + "logits/rejected": 1.1484375, + "logps/chosen": -392.0, + "logps/rejected": -708.0, + "loss": 0.4998992919921875, + "memory(GiB)": 43.66, + "nll_loss": 0.734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.625, + "rewards/margins": 17.0, + "rewards/rejected": 1.5546875, + "step": 140, + "train_speed(iter/s)": 0.353731 + }, + { + "epoch": 2.8, + "eval_logits/chosen": -1.0546875, + "eval_logits/rejected": 0.28515625, + "eval_logps/chosen": -6.65625, + "eval_logps/rejected": -282.0, + "eval_loss": 0.49853515625, + "eval_nll_loss": 0.2890625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.1875, + "eval_rewards/margins": 14.375, + "eval_rewards/rejected": -4.1875, + "eval_runtime": 1.2976, + "eval_samples_per_second": 3.083, + "eval_steps_per_second": 0.771, + "step": 140 + }, + { + "epoch": 2.9, + "grad_norm": 0.7368475635982081, + "learning_rate": 4.11001052161225e-05, + "logits/chosen": 1.484375, + "logits/rejected": 0.7578125, + "logps/chosen": -624.0, + "logps/rejected": -664.0, + "loss": 0.5592437744140625, + "memory(GiB)": 43.66, + "nll_loss": 0.7421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.75, + "rewards/margins": 16.875, + "rewards/rejected": 4.90625, + "step": 145, + "train_speed(iter/s)": 0.350922 + }, + { + "epoch": 3.0, + "grad_norm": 0.526438371292198, + "learning_rate": 3.786103689779861e-05, + "logits/chosen": 2.53125, + "logits/rejected": -0.11962890625, + "logps/chosen": -406.0, + "logps/rejected": -310.0, + "loss": 0.4158843994140625, + "memory(GiB)": 43.66, + "nll_loss": 0.39453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.5, + "rewards/margins": 19.0, + "rewards/rejected": 3.5, + "step": 150, + "train_speed(iter/s)": 0.350374 + }, + { + "epoch": 3.1, + 
"grad_norm": 0.3832315044394523, + "learning_rate": 3.467527329945026e-05, + "logits/chosen": 1.1484375, + "logits/rejected": 1.328125, + "logps/chosen": -600.0, + "logps/rejected": -588.0, + "loss": 0.5336650848388672, + "memory(GiB)": 43.66, + "nll_loss": 0.515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.125, + "rewards/margins": 17.375, + "rewards/rejected": 1.7265625, + "step": 155, + "train_speed(iter/s)": 0.350973 + }, + { + "epoch": 3.2, + "grad_norm": 0.41804293149830996, + "learning_rate": 3.1556803773799614e-05, + "logits/chosen": 2.84375, + "logits/rejected": 0.10693359375, + "logps/chosen": -416.0, + "logps/rejected": -360.0, + "loss": 0.456103515625, + "memory(GiB)": 43.66, + "nll_loss": 0.38671875, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.5, + "rewards/margins": 16.75, + "rewards/rejected": 4.71875, + "step": 160, + "train_speed(iter/s)": 0.350912 + }, + { + "epoch": 3.2, + "eval_logits/chosen": -1.046875, + "eval_logits/rejected": 0.1669921875, + "eval_logps/chosen": -6.46875, + "eval_logps/rejected": -286.0, + "eval_loss": 0.4970703125, + "eval_nll_loss": 0.28125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.1875, + "eval_rewards/margins": 14.75, + "eval_rewards/rejected": -4.59375, + "eval_runtime": 1.2858, + "eval_samples_per_second": 3.111, + "eval_steps_per_second": 0.778, + "step": 160 + }, + { + "epoch": 3.3, + "grad_norm": 0.9977986802164632, + "learning_rate": 2.8519322171253602e-05, + "logits/chosen": -0.173828125, + "logits/rejected": 0.234375, + "logps/chosen": -145.0, + "logps/rejected": -498.0, + "loss": 0.4285240173339844, + "memory(GiB)": 43.66, + "nll_loss": 0.16796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.625, + "rewards/margins": 18.5, + "rewards/rejected": -1.7890625, + "step": 165, + "train_speed(iter/s)": 0.350263 + }, + { + "epoch": 3.4, + "grad_norm": 0.5073480559704879, + "learning_rate": 2.5576166707349385e-05, + "logits/chosen": 3.234375, + "logits/rejected": 
-0.92578125, + "logps/chosen": -640.0, + "logps/rejected": -207.0, + "loss": 0.45649566650390627, + "memory(GiB)": 43.66, + "nll_loss": 0.453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 25.0, + "rewards/margins": 19.125, + "rewards/rejected": 5.875, + "step": 170, + "train_speed(iter/s)": 0.35017 + }, + { + "epoch": 3.5, + "grad_norm": 0.5295452954524565, + "learning_rate": 2.2740261391866637e-05, + "logits/chosen": 0.498046875, + "logits/rejected": 0.392578125, + "logps/chosen": -466.0, + "logps/rejected": -592.0, + "loss": 0.39521121978759766, + "memory(GiB)": 43.66, + "nll_loss": 0.65625, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.75, + "rewards/margins": 14.75, + "rewards/rejected": 7.03125, + "step": 175, + "train_speed(iter/s)": 0.350866 + }, + { + "epoch": 3.6, + "grad_norm": 0.30567037033016853, + "learning_rate": 2.002405927680374e-05, + "logits/chosen": 3.09375, + "logits/rejected": -0.09326171875, + "logps/chosen": -482.0, + "logps/rejected": -262.0, + "loss": 0.500946044921875, + "memory(GiB)": 43.66, + "nll_loss": 0.546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.25, + "rewards/margins": 17.125, + "rewards/rejected": 4.125, + "step": 180, + "train_speed(iter/s)": 0.351966 + }, + { + "epoch": 3.6, + "eval_logits/chosen": -1.1640625, + "eval_logits/rejected": 0.060546875, + "eval_logps/chosen": -7.0, + "eval_logps/rejected": -284.0, + "eval_loss": 0.501953125, + "eval_nll_loss": 0.302734375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 14.5, + "eval_rewards/rejected": -4.40625, + "eval_runtime": 1.2632, + "eval_samples_per_second": 3.167, + "eval_steps_per_second": 0.792, + "step": 180 + }, + { + "epoch": 3.7, + "grad_norm": 0.9188395046874438, + "learning_rate": 1.743948777242814e-05, + "logits/chosen": 3.0, + "logits/rejected": -0.10009765625, + "logps/chosen": -536.0, + "logps/rejected": -229.0, + "loss": 0.4772364616394043, + "memory(GiB)": 43.66, + "nll_loss": 0.609375, + 
"rewards/accuracies": 1.0, + "rewards/chosen": 21.25, + "rewards/margins": 16.875, + "rewards/rejected": 4.40625, + "step": 185, + "train_speed(iter/s)": 0.35126 + }, + { + "epoch": 3.8, + "grad_norm": 0.5288801717382393, + "learning_rate": 1.4997896271528739e-05, + "logits/chosen": 0.9765625, + "logits/rejected": 1.4140625, + "logps/chosen": -330.0, + "logps/rejected": -708.0, + "loss": 0.4629364013671875, + "memory(GiB)": 43.66, + "nll_loss": 0.6875, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.75, + "rewards/margins": 18.5, + "rewards/rejected": 1.34375, + "step": 190, + "train_speed(iter/s)": 0.352271 + }, + { + "epoch": 3.9, + "grad_norm": 0.5916934882939896, + "learning_rate": 1.2710006311864104e-05, + "logits/chosen": -0.31640625, + "logits/rejected": 0.78125, + "logps/chosen": -414.0, + "logps/rejected": -680.0, + "loss": 0.38446922302246095, + "memory(GiB)": 43.66, + "nll_loss": 0.435546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.75, + "rewards/margins": 20.0, + "rewards/rejected": 2.765625, + "step": 195, + "train_speed(iter/s)": 0.352064 + }, + { + "epoch": 4.0, + "grad_norm": 0.6513423080203622, + "learning_rate": 1.0585864495652897e-05, + "logits/chosen": 1.2578125, + "logits/rejected": 0.53125, + "logps/chosen": -414.0, + "logps/rejected": -728.0, + "loss": 0.43031768798828124, + "memory(GiB)": 56.51, + "nll_loss": 0.48046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.875, + "rewards/margins": 18.125, + "rewards/rejected": 3.78125, + "step": 200, + "train_speed(iter/s)": 0.351751 + }, + { + "epoch": 4.0, + "eval_logits/chosen": -1.265625, + "eval_logits/rejected": 0.0634765625, + "eval_logps/chosen": -7.5625, + "eval_logps/rejected": -286.0, + "eval_loss": 0.509765625, + "eval_nll_loss": 0.328125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 14.75, + "eval_rewards/rejected": -4.59375, + "eval_runtime": 1.2468, + "eval_samples_per_second": 3.208, + "eval_steps_per_second": 
0.802, + "step": 200 + }, + { + "epoch": 4.1, + "grad_norm": 0.46757622724638764, + "learning_rate": 8.634798372847148e-06, + "logits/chosen": 0.8984375, + "logits/rejected": 1.0546875, + "logps/chosen": -428.0, + "logps/rejected": -704.0, + "loss": 0.4748867034912109, + "memory(GiB)": 56.51, + "nll_loss": 0.4375, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.5, + "rewards/margins": 16.125, + "rewards/rejected": 5.40625, + "step": 205, + "train_speed(iter/s)": 0.35013 + }, + { + "epoch": 4.2, + "grad_norm": 0.5521912104468201, + "learning_rate": 6.865375481914016e-06, + "logits/chosen": 2.734375, + "logits/rejected": -0.33984375, + "logps/chosen": -274.0, + "logps/rejected": -230.0, + "loss": 0.42456893920898436, + "memory(GiB)": 56.51, + "nll_loss": 0.34375, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.125, + "rewards/margins": 16.375, + "rewards/rejected": 3.71875, + "step": 210, + "train_speed(iter/s)": 0.350785 + }, + { + "epoch": 4.3, + "grad_norm": 0.6123257585261611, + "learning_rate": 5.285365727986707e-06, + "logits/chosen": -1.28125, + "logits/rejected": 1.8359375, + "logps/chosen": -211.0, + "logps/rejected": -1216.0, + "loss": 0.42719383239746095, + "memory(GiB)": 56.51, + "nll_loss": 0.326171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.0, + "rewards/margins": 18.25, + "rewards/rejected": 0.8046875, + "step": 215, + "train_speed(iter/s)": 0.35093 + }, + { + "epoch": 4.4, + "grad_norm": 0.3529237641617113, + "learning_rate": 3.901707263589671e-06, + "logits/chosen": -1.171875, + "logits/rejected": 1.640625, + "logps/chosen": -135.0, + "logps/rejected": -984.0, + "loss": 0.3799613952636719, + "memory(GiB)": 56.51, + "nll_loss": 0.236328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.25, + "rewards/margins": 18.375, + "rewards/rejected": -2.078125, + "step": 220, + "train_speed(iter/s)": 0.350899 + }, + { + "epoch": 4.4, + "eval_logits/chosen": -1.3125, + "eval_logits/rejected": 0.03662109375, + "eval_logps/chosen": 
-7.9375, + "eval_logps/rejected": -290.0, + "eval_loss": 0.51318359375, + "eval_nll_loss": 0.345703125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.0625, + "eval_rewards/margins": 15.0625, + "eval_rewards/rejected": -5.0, + "eval_runtime": 1.3289, + "eval_samples_per_second": 3.01, + "eval_steps_per_second": 0.753, + "step": 220 + }, + { + "epoch": 4.5, + "grad_norm": 0.4122296158011087, + "learning_rate": 2.7204760217631074e-06, + "logits/chosen": 4.0, + "logits/rejected": -0.318359375, + "logps/chosen": -696.0, + "logps/rejected": -280.0, + "loss": 0.42312088012695315, + "memory(GiB)": 56.51, + "nll_loss": 0.70703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 25.875, + "rewards/margins": 19.75, + "rewards/rejected": 6.09375, + "step": 225, + "train_speed(iter/s)": 0.350059 + }, + { + "epoch": 4.6, + "grad_norm": 0.446952222017286, + "learning_rate": 1.7468590353731495e-06, + "logits/chosen": 0.98828125, + "logits/rejected": 0.1640625, + "logps/chosen": -320.0, + "logps/rejected": -716.0, + "loss": 0.3846301078796387, + "memory(GiB)": 56.51, + "nll_loss": 0.419921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.5, + "rewards/margins": 20.375, + "rewards/rejected": 0.07177734375, + "step": 230, + "train_speed(iter/s)": 0.35023 + }, + { + "epoch": 4.7, + "grad_norm": 0.7152024714830698, + "learning_rate": 9.851316597681958e-07, + "logits/chosen": 0.1025390625, + "logits/rejected": 1.296875, + "logps/chosen": -221.0, + "logps/rejected": -712.0, + "loss": 0.3764499664306641, + "memory(GiB)": 56.51, + "nll_loss": 0.3046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.25, + "rewards/margins": 18.875, + "rewards/rejected": -1.71875, + "step": 235, + "train_speed(iter/s)": 0.350576 + }, + { + "epoch": 4.8, + "grad_norm": 0.32906938877953124, + "learning_rate": 4.386387988014273e-07, + "logits/chosen": 0.9375, + "logits/rejected": -0.416015625, + "logps/chosen": -556.0, + "logps/rejected": -828.0, + "loss": 0.4298358917236328, + 
"memory(GiB)": 56.51, + "nll_loss": 0.8515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 27.0, + "rewards/margins": 20.5, + "rewards/rejected": 6.5, + "step": 240, + "train_speed(iter/s)": 0.351336 + }, + { + "epoch": 4.8, + "eval_logits/chosen": -1.34375, + "eval_logits/rejected": 0.037109375, + "eval_logps/chosen": -8.3125, + "eval_logps/rejected": -292.0, + "eval_loss": 0.51806640625, + "eval_nll_loss": 0.361328125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.0, + "eval_rewards/margins": 15.1875, + "eval_rewards/rejected": -5.1875, + "eval_runtime": 1.2957, + "eval_samples_per_second": 3.087, + "eval_steps_per_second": 0.772, + "step": 240 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 210993673011200.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/training_args.bin b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..475462e40417630252a03e5a9352b5e11f000ed1 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:647dcffabf1a4c913f292b185978f2f179c4819a3df809536d284c803d9d97fe +size 8888 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/zero_to_fp32.py b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/zero_to_fp32.py new file mode 100755 index 
0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-240/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pseudo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/README.md b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/README.md new file mode 100644 index 0000000000000000000000000000000000000000..525e0a2a5243ae77491e227cbf74fa8ff0a43579 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-7b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both 
direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/adapter_config.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..454fa4ee7fc180fe11671ee7f6c90d0ba44328a6 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + 
"rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "gate_proj", + "up_proj", + "o_proj", + "v_proj", + "down_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/adapter_model.safetensors b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7644c7d9ae47dd19d57f788a238b97fd4d63b033 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66fd67d36c80191b8af10003caf1156098c0130ff5980a86b693b2b33b085b10 +size 40422208 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/additional_config.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/args.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/args.json new file mode 100644 index 0000000000000000000000000000000000000000..4b016b97ef2d4589dae9d4623f25925cb829d241 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/args.json @@ -0,0 +1,371 @@ +{ + "model": 
"/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + 
"do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + 
"fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + 
"push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + 
"galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-7b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-7b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, 
task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, 
per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, 
use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, 
neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6e2a35e471b0848da8f5a5b1269862c4601abc47 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09efdfd5222ec47fa48e42c726e4fd9e337ee95024de6db6c4cdff5792ac4ef1 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f77a9b7ebf0b639b17c86f27981960bb26948199 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:683d897f68bc0d912dd5e1703bc564ee32df1e9cefa48622216e4272da810837 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b42c0b59f7f9067b1f0869825affe73182c1e1fb --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78cf941fd182a2860b08e82b4ab900f259a80ab22c8b867634cd4f3b9fdb6f2d +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1d524bce2474ef55be93723b5ab67ff629e22d48 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:94d125c9d9186c58dc65c65f69e381c2207e73f8aa3a42b5c0188384badb2da8 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ab1c2279717e99894204be171b434295f357dad1 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2b08d1f527a868d020444f783157e00c681100f2e6389ca676f98f782fb253e +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ccddc28d11c85c9ccb233b137cf88a283984d5a4 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f68541ad5cdb47d996b40f30cee3e289ee51ff239d76c65d1612c62ac4ee6eca +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..61d8f232f35276b8f5d2d437f2c604a5f982a1de --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca3cb78bd5cf3b6f6b43b402ae1df59606ffdd5cb0849d86f256d347671a0370 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2efe5ada3b6c06cf1f2c7bc461d87f754fef0217 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83111d36cee33491230c7ebc0226063e16022735ed61b706ba6f1f9f4be1aac3 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/zero_pp_rank_0_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f2c3092105c4a838a96c2dea0dd5c31665ceb57c --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fec8edc934f2a0185eb8d30195e1a18228c6a587d6ede6edc7a472769afdaf08 +size 388374 diff --git 
a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/zero_pp_rank_1_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a4a91a8ed2ccab0737058d66b97e21e6020cc59f --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87b495c64df796d442cba45624471689a1c2d7e068ae6ee0d2a719ced188b006 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/zero_pp_rank_2_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3dc81e60dffc359aa3a37ede87157f93c3745f1d --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0346365794437a1ed6e9e6a88dfdf5650d29e62fd02a7355b3c96f353deb833e +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/zero_pp_rank_3_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..941bc625f67a5fead600462fdb6b6317c407684e --- /dev/null +++ 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc3a4043801893038506235e314e171b7942d3960c3e75f30adeb977aa6060cc +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/zero_pp_rank_4_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..79465bea1de868b1557924ea4023fb8d3f4b181f --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:216177741221aa8d9a45f8585fe251872247174757ef55af1684973e598f2f64 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/zero_pp_rank_5_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e9035b091272d7f2a54d7d89cc475ab68cba100 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b8498ede4d568525f1bf3712c7dbd8e843ba29417d5b2981c11f0c078f1b27b +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/zero_pp_rank_6_mp_rank_00_model_states.pt 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..da634b9f398dc9e9f863bcf5bc316dad4a06c550 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b212534471341f0f565d30dc3e4f2aac258c769b89a738555183ae823e048cf1 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/zero_pp_rank_7_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec1e0abd5d3374b54543316a954676205b438c5b --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/global_step250/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffea896ff3582118da578f1f1a59e2ff36da809e5a882e40a07d0fe41ca7c37f +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/latest b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/latest new file mode 100644 index 0000000000000000000000000000000000000000..87449ff1a854ba4a77ea33fbc24adaed3311d6b1 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/latest @@ -0,0 +1 @@ +global_step250 \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/rng_state_0.pth 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ab29abc7c5c196288fd5c119c67c4f655f27d44c --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f5c4738c31c5c9a38e1f586256d59a0e8e7d02641b9b9af2afdbe078440aeb4 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/rng_state_1.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d8e0ba47a098b34da66857368b41c80a5d9d796f --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d374b3390eb52ec7f6161c06272d4f26cb715692bdf2ad5374287b6de420ca3 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/rng_state_2.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..7676e48e7dd332be5f46585fc5f824c5791f76ae --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24111edc5a6a2994166cd410155ee3c630816d0fe21c13808ebd2a2ae45bc9d8 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/rng_state_3.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/rng_state_3.pth new file mode 100644 index 
0000000000000000000000000000000000000000..228202ae722c05ed5fafc13eeac33a8a2685cca5 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:157b21eda1c7f898e519251deed08049767ffba123797289de56343a92ba7380 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/rng_state_4.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..48a63de21fa3e29782ced5828f8f34fba46bad33 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ccb615552e5845759bc13aa2ae50c0525fbf941fa76ee2e2c20cb9838fe1995 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/rng_state_5.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..d487727115f1120e55e91ad9583fb23ff8e34083 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fcf720fc22147ce563d6f2c2f6f3d916a7e8b7af174b480d072b5c822e992aa +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/rng_state_6.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d90628d8fd79ee2a98fb904251b6d7938f5120b0 --- /dev/null +++ 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d055d3b033dc8e6fc2a19aa95162960544ab94a903988874315efe4ed5aa8e13 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/rng_state_7.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..54e1556a7ec04e7309f4c9130351c880ef6a0626 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e03c685f2e019350bfdd41f006495a18690aacbccd7ffc1f40de827f433eb87 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/scheduler.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..871b4a6cbd60ea4b2ef2416f3a46bbe632ddb667 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e80b1af2ae92a304371e36f6c1b7001f5dafc395be0b17c480957fc7fb58d8cd +size 1064 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/trainer_state.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..40bfbf99c55462ff2d69e91b358740a02951bd75 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/trainer_state.json @@ -0,0 +1,1172 @@ +{ + "best_metric": 0.49707031, + 
"best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160", + "epoch": 5.0, + "eval_steps": 20, + "global_step": 250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 9.15866064284304, + "learning_rate": 7.692307692307694e-06, + "logits/chosen": 2.390625, + "logits/rejected": -2.546875, + "logps/chosen": -520.0, + "logps/rejected": -142.0, + "loss": 2.4697265625, + "memory(GiB)": 8.4, + "nll_loss": 0.51171875, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.126463 + }, + { + "epoch": 0.1, + "grad_norm": 26.585759281524314, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": 0.1533203125, + "logits/rejected": -0.8671875, + "logps/chosen": -728.0, + "logps/rejected": -304.0, + "loss": 2.3878173828125, + "memory(GiB)": 12.04, + "nll_loss": 1.1484375, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.125, + "rewards/margins": -0.0250244140625, + "rewards/rejected": 0.150390625, + "step": 5, + "train_speed(iter/s)": 0.276117 + }, + { + "epoch": 0.2, + "grad_norm": 8.664467296577401, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -0.419921875, + "logits/rejected": 1.2890625, + "logps/chosen": -644.0, + "logps/rejected": -572.0, + "loss": 2.18505859375, + "memory(GiB)": 17.87, + "nll_loss": 1.4921875, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.1103515625, + "rewards/margins": 0.08984375, + "rewards/rejected": 0.0201416015625, + "step": 10, + "train_speed(iter/s)": 0.313547 + }, + { + "epoch": 0.3, + "grad_norm": 6.325430646213694, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -0.7578125, + "logits/rejected": 1.8203125, + "logps/chosen": -924.0, + "logps/rejected": -1552.0, + "loss": 1.76845703125, + 
"memory(GiB)": 32.02, + "nll_loss": 1.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.75, + "rewards/margins": 2.390625, + "rewards/rejected": 0.359375, + "step": 15, + "train_speed(iter/s)": 0.318522 + }, + { + "epoch": 0.4, + "grad_norm": 2.351010309283972, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": 0.5859375, + "logits/rejected": -0.4375, + "logps/chosen": -432.0, + "logps/rejected": -324.0, + "loss": 1.172265625, + "memory(GiB)": 32.02, + "nll_loss": 0.6640625, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 4.65625, + "rewards/margins": 1.5546875, + "rewards/rejected": 3.09375, + "step": 20, + "train_speed(iter/s)": 0.331194 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.3125, + "eval_logits/rejected": 0.86328125, + "eval_logps/chosen": -14.3125, + "eval_logps/rejected": -182.0, + "eval_loss": 0.6728515625, + "eval_nll_loss": 0.62109375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.375, + "eval_rewards/margins": 3.5625, + "eval_rewards/rejected": 5.8125, + "eval_runtime": 1.2862, + "eval_samples_per_second": 3.11, + "eval_steps_per_second": 0.778, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.5984307422346672, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": 0.255859375, + "logits/rejected": -0.0390625, + "logps/chosen": -326.0, + "logps/rejected": -420.0, + "loss": 0.8047607421875, + "memory(GiB)": 32.02, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.125, + "rewards/margins": 5.40625, + "rewards/rejected": 3.6875, + "step": 25, + "train_speed(iter/s)": 0.330795 + }, + { + "epoch": 0.6, + "grad_norm": 0.5223734060204478, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -0.21875, + "logits/rejected": 1.546875, + "logps/chosen": -400.0, + "logps/rejected": -656.0, + "loss": 0.6901611328125, + "memory(GiB)": 32.02, + "nll_loss": 0.78125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.5625, + "rewards/margins": 6.78125, + 
"rewards/rejected": 3.765625, + "step": 30, + "train_speed(iter/s)": 0.339407 + }, + { + "epoch": 0.7, + "grad_norm": 1.0839186250481967, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": 1.5625, + "logits/rejected": -1.578125, + "logps/chosen": -592.0, + "logps/rejected": -322.0, + "loss": 0.7055419921875, + "memory(GiB)": 32.02, + "nll_loss": 0.96875, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.75, + "rewards/margins": 9.625, + "rewards/rejected": 3.125, + "step": 35, + "train_speed(iter/s)": 0.346622 + }, + { + "epoch": 0.8, + "grad_norm": 13.025539300060226, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": 0.08740234375, + "logits/rejected": 0.71484375, + "logps/chosen": -225.0, + "logps/rejected": -322.0, + "loss": 0.6363037109375, + "memory(GiB)": 32.02, + "nll_loss": 0.56640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.0625, + "rewards/margins": 7.53125, + "rewards/rejected": 3.546875, + "step": 40, + "train_speed(iter/s)": 0.345794 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -0.8046875, + "eval_logits/rejected": 0.9375, + "eval_logps/chosen": -7.1875, + "eval_logps/rejected": -212.0, + "eval_loss": 0.5283203125, + "eval_nll_loss": 0.3125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 7.3125, + "eval_rewards/rejected": 2.796875, + "eval_runtime": 1.2905, + "eval_samples_per_second": 3.1, + "eval_steps_per_second": 0.775, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.49724108967226127, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": 2.890625, + "logits/rejected": 0.002349853515625, + "logps/chosen": -426.0, + "logps/rejected": -430.0, + "loss": 0.5437744140625, + "memory(GiB)": 32.02, + "nll_loss": 0.470703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.0625, + "rewards/margins": 10.6875, + "rewards/rejected": 3.390625, + "step": 45, + "train_speed(iter/s)": 0.342553 + }, + { + "epoch": 1.0, + "grad_norm": 1.4155859018634052, + 
"learning_rate": 9.410582299213573e-05, + "logits/chosen": 0.2451171875, + "logits/rejected": 1.75, + "logps/chosen": -804.0, + "logps/rejected": -1480.0, + "loss": 0.57803955078125, + "memory(GiB)": 32.02, + "nll_loss": 0.6953125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.25, + "rewards/margins": 9.125, + "rewards/rejected": 7.125, + "step": 50, + "train_speed(iter/s)": 0.346029 + }, + { + "epoch": 1.1, + "grad_norm": 0.5694275670231188, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": 1.8359375, + "logits/rejected": 0.703125, + "logps/chosen": -386.0, + "logps/rejected": -612.0, + "loss": 0.6138671875, + "memory(GiB)": 43.66, + "nll_loss": 0.546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.75, + "rewards/margins": 12.25, + "rewards/rejected": 3.53125, + "step": 55, + "train_speed(iter/s)": 0.347089 + }, + { + "epoch": 1.2, + "grad_norm": 0.3472486708219598, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": 1.65625, + "logits/rejected": 2.15625, + "logps/chosen": -233.0, + "logps/rejected": -520.0, + "loss": 0.5795654296875, + "memory(GiB)": 43.66, + "nll_loss": 0.396484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.375, + "rewards/margins": 12.1875, + "rewards/rejected": 3.140625, + "step": 60, + "train_speed(iter/s)": 0.349928 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -0.23828125, + "eval_logits/rejected": 1.703125, + "eval_logps/chosen": -7.3125, + "eval_logps/rejected": -223.0, + "eval_loss": 0.51806640625, + "eval_nll_loss": 0.31640625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 8.4375, + "eval_rewards/rejected": 1.703125, + "eval_runtime": 1.2918, + "eval_samples_per_second": 3.096, + "eval_steps_per_second": 0.774, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.48830819519223906, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": 0.97265625, + "logits/rejected": 1.9921875, + "logps/chosen": -346.0, + "logps/rejected": -844.0, + 
"loss": 0.6193359375, + "memory(GiB)": 43.66, + "nll_loss": 0.640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5625, + "rewards/margins": 11.875, + "rewards/rejected": 3.703125, + "step": 65, + "train_speed(iter/s)": 0.34619 + }, + { + "epoch": 1.4, + "grad_norm": 0.3461001279478362, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -0.6875, + "logits/rejected": 2.28125, + "logps/chosen": -88.0, + "logps/rejected": -1120.0, + "loss": 0.5681396484375, + "memory(GiB)": 43.66, + "nll_loss": 0.84375, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 11.5625, + "rewards/margins": 9.125, + "rewards/rejected": 2.484375, + "step": 70, + "train_speed(iter/s)": 0.349389 + }, + { + "epoch": 1.5, + "grad_norm": 0.49738864814975586, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": 2.515625, + "logits/rejected": 0.263671875, + "logps/chosen": -462.0, + "logps/rejected": -434.0, + "loss": 0.517205810546875, + "memory(GiB)": 43.66, + "nll_loss": 0.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.625, + "rewards/margins": 16.375, + "rewards/rejected": 2.28125, + "step": 75, + "train_speed(iter/s)": 0.351345 + }, + { + "epoch": 1.6, + "grad_norm": 0.3498527388541882, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": 3.5625, + "logits/rejected": -1.296875, + "logps/chosen": -580.0, + "logps/rejected": -132.0, + "loss": 0.5629638671875, + "memory(GiB)": 43.66, + "nll_loss": 0.703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.5, + "rewards/margins": 15.6875, + "rewards/rejected": 5.84375, + "step": 80, + "train_speed(iter/s)": 0.353922 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -0.68359375, + "eval_logits/rejected": 0.3125, + "eval_logps/chosen": -6.3125, + "eval_logps/rejected": -306.0, + "eval_loss": 0.5009765625, + "eval_nll_loss": 0.2734375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.1875, + "eval_rewards/margins": 16.75, + "eval_rewards/rejected": -6.59375, + "eval_runtime": 
1.3254, + "eval_samples_per_second": 3.018, + "eval_steps_per_second": 0.754, + "step": 80 + }, + { + "epoch": 1.7, + "grad_norm": 0.6649877164961882, + "learning_rate": 7.890460001124242e-05, + "logits/chosen": 0.72265625, + "logits/rejected": 0.388671875, + "logps/chosen": -326.0, + "logps/rejected": -768.0, + "loss": 0.70977783203125, + "memory(GiB)": 43.66, + "nll_loss": 0.458984375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5, + "rewards/margins": 18.375, + "rewards/rejected": -2.859375, + "step": 85, + "train_speed(iter/s)": 0.351477 + }, + { + "epoch": 1.8, + "grad_norm": 0.5774391096810936, + "learning_rate": 7.613905469171246e-05, + "logits/chosen": 2.15625, + "logits/rejected": 1.515625, + "logps/chosen": -290.0, + "logps/rejected": -520.0, + "loss": 0.5434326171875, + "memory(GiB)": 43.66, + "nll_loss": 0.56640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.75, + "rewards/margins": 14.25, + "rewards/rejected": 0.5390625, + "step": 90, + "train_speed(iter/s)": 0.351018 + }, + { + "epoch": 1.9, + "grad_norm": 0.46335079012099145, + "learning_rate": 7.325872732868869e-05, + "logits/chosen": 1.65625, + "logits/rejected": 1.1640625, + "logps/chosen": -366.0, + "logps/rejected": -620.0, + "loss": 0.60682373046875, + "memory(GiB)": 43.66, + "nll_loss": 0.5, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.75, + "rewards/margins": 15.9375, + "rewards/rejected": 0.7890625, + "step": 95, + "train_speed(iter/s)": 0.351082 + }, + { + "epoch": 2.0, + "grad_norm": 0.44055183589508673, + "learning_rate": 7.027626604064969e-05, + "logits/chosen": 0.59765625, + "logits/rejected": 0.1025390625, + "logps/chosen": -276.0, + "logps/rejected": -360.0, + "loss": 0.51856689453125, + "memory(GiB)": 43.66, + "nll_loss": 0.470703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.6875, + "rewards/margins": 14.0, + "rewards/rejected": 1.65625, + "step": 100, + "train_speed(iter/s)": 0.350869 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -0.63671875, + 
"eval_logits/rejected": 0.66796875, + "eval_logps/chosen": -6.84375, + "eval_logps/rejected": -280.0, + "eval_loss": 0.50390625, + "eval_nll_loss": 0.298828125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 14.125, + "eval_rewards/rejected": -4.0, + "eval_runtime": 1.3254, + "eval_samples_per_second": 3.018, + "eval_steps_per_second": 0.754, + "step": 100 + }, + { + "epoch": 2.1, + "grad_norm": 0.43319830084665456, + "learning_rate": 6.720476743745072e-05, + "logits/chosen": 3.5, + "logits/rejected": 0.185546875, + "logps/chosen": -442.0, + "logps/rejected": -580.0, + "loss": 0.540869140625, + "memory(GiB)": 43.66, + "nll_loss": 0.60546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.375, + "rewards/margins": 16.125, + "rewards/rejected": 4.25, + "step": 105, + "train_speed(iter/s)": 0.34793 + }, + { + "epoch": 2.2, + "grad_norm": 0.5087186205209226, + "learning_rate": 6.405771911037699e-05, + "logits/chosen": 0.41015625, + "logits/rejected": 1.9375, + "logps/chosen": -688.0, + "logps/rejected": -1280.0, + "loss": 0.5629150390625, + "memory(GiB)": 43.66, + "nll_loss": 0.69140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.875, + "rewards/margins": 14.5, + "rewards/rejected": 6.34375, + "step": 110, + "train_speed(iter/s)": 0.34833 + }, + { + "epoch": 2.3, + "grad_norm": 0.30352973245628134, + "learning_rate": 6.08489404053159e-05, + "logits/chosen": 1.546875, + "logits/rejected": 1.0625, + "logps/chosen": -568.0, + "logps/rejected": -704.0, + "loss": 0.596630859375, + "memory(GiB)": 43.66, + "nll_loss": 0.5390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.0, + "rewards/margins": 15.4375, + "rewards/rejected": 5.5, + "step": 115, + "train_speed(iter/s)": 0.349547 + }, + { + "epoch": 2.4, + "grad_norm": 0.5099680534446042, + "learning_rate": 5.7592521739125726e-05, + "logits/chosen": 2.40625, + "logits/rejected": 0.002349853515625, + "logps/chosen": -368.0, + "logps/rejected": -316.0, + "loss": 
0.463055419921875, + "memory(GiB)": 43.66, + "nll_loss": 0.37109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.625, + "rewards/margins": 13.1875, + "rewards/rejected": 3.484375, + "step": 120, + "train_speed(iter/s)": 0.35126 + }, + { + "epoch": 2.4, + "eval_logits/chosen": -0.96484375, + "eval_logits/rejected": 0.416015625, + "eval_logps/chosen": -6.4375, + "eval_logps/rejected": -280.0, + "eval_loss": 0.49755859375, + "eval_nll_loss": 0.279296875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.1875, + "eval_rewards/margins": 14.1875, + "eval_rewards/rejected": -4.0, + "eval_runtime": 1.3007, + "eval_samples_per_second": 3.075, + "eval_steps_per_second": 0.769, + "step": 120 + }, + { + "epoch": 2.5, + "grad_norm": 0.6759212291558464, + "learning_rate": 5.430276272567485e-05, + "logits/chosen": -1.4140625, + "logits/rejected": 0.82421875, + "logps/chosen": -62.5, + "logps/rejected": -1288.0, + "loss": 0.5360504150390625, + "memory(GiB)": 43.66, + "nll_loss": 0.7265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.75, + "rewards/margins": 19.75, + "rewards/rejected": -7.96875, + "step": 125, + "train_speed(iter/s)": 0.350267 + }, + { + "epoch": 2.6, + "grad_norm": 0.3635118516286732, + "learning_rate": 5.0994109383253506e-05, + "logits/chosen": 2.3125, + "logits/rejected": 0.546875, + "logps/chosen": -348.0, + "logps/rejected": -314.0, + "loss": 0.4655029296875, + "memory(GiB)": 43.66, + "nll_loss": 0.484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.875, + "rewards/margins": 15.3125, + "rewards/rejected": 3.515625, + "step": 130, + "train_speed(iter/s)": 0.35164 + }, + { + "epoch": 2.7, + "grad_norm": 0.4546423377677983, + "learning_rate": 4.768109069909307e-05, + "logits/chosen": -0.375, + "logits/rejected": 0.88671875, + "logps/chosen": -157.0, + "logps/rejected": -624.0, + "loss": 0.44596214294433595, + "memory(GiB)": 43.66, + "nll_loss": 0.24609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.375, + 
"rewards/margins": 14.9375, + "rewards/rejected": -1.5625, + "step": 135, + "train_speed(iter/s)": 0.352504 + }, + { + "epoch": 2.8, + "grad_norm": 0.3766526906078867, + "learning_rate": 4.4378254829551396e-05, + "logits/chosen": 1.484375, + "logits/rejected": 1.1484375, + "logps/chosen": -392.0, + "logps/rejected": -708.0, + "loss": 0.4998992919921875, + "memory(GiB)": 43.66, + "nll_loss": 0.734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.625, + "rewards/margins": 17.0, + "rewards/rejected": 1.5546875, + "step": 140, + "train_speed(iter/s)": 0.353731 + }, + { + "epoch": 2.8, + "eval_logits/chosen": -1.0546875, + "eval_logits/rejected": 0.28515625, + "eval_logps/chosen": -6.65625, + "eval_logps/rejected": -282.0, + "eval_loss": 0.49853515625, + "eval_nll_loss": 0.2890625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.1875, + "eval_rewards/margins": 14.375, + "eval_rewards/rejected": -4.1875, + "eval_runtime": 1.2976, + "eval_samples_per_second": 3.083, + "eval_steps_per_second": 0.771, + "step": 140 + }, + { + "epoch": 2.9, + "grad_norm": 0.7368475635982081, + "learning_rate": 4.11001052161225e-05, + "logits/chosen": 1.484375, + "logits/rejected": 0.7578125, + "logps/chosen": -624.0, + "logps/rejected": -664.0, + "loss": 0.5592437744140625, + "memory(GiB)": 43.66, + "nll_loss": 0.7421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.75, + "rewards/margins": 16.875, + "rewards/rejected": 4.90625, + "step": 145, + "train_speed(iter/s)": 0.350922 + }, + { + "epoch": 3.0, + "grad_norm": 0.526438371292198, + "learning_rate": 3.786103689779861e-05, + "logits/chosen": 2.53125, + "logits/rejected": -0.11962890625, + "logps/chosen": -406.0, + "logps/rejected": -310.0, + "loss": 0.4158843994140625, + "memory(GiB)": 43.66, + "nll_loss": 0.39453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.5, + "rewards/margins": 19.0, + "rewards/rejected": 3.5, + "step": 150, + "train_speed(iter/s)": 0.350374 + }, + { + "epoch": 3.1, + 
"grad_norm": 0.3832315044394523, + "learning_rate": 3.467527329945026e-05, + "logits/chosen": 1.1484375, + "logits/rejected": 1.328125, + "logps/chosen": -600.0, + "logps/rejected": -588.0, + "loss": 0.5336650848388672, + "memory(GiB)": 43.66, + "nll_loss": 0.515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.125, + "rewards/margins": 17.375, + "rewards/rejected": 1.7265625, + "step": 155, + "train_speed(iter/s)": 0.350973 + }, + { + "epoch": 3.2, + "grad_norm": 0.41804293149830996, + "learning_rate": 3.1556803773799614e-05, + "logits/chosen": 2.84375, + "logits/rejected": 0.10693359375, + "logps/chosen": -416.0, + "logps/rejected": -360.0, + "loss": 0.456103515625, + "memory(GiB)": 43.66, + "nll_loss": 0.38671875, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.5, + "rewards/margins": 16.75, + "rewards/rejected": 4.71875, + "step": 160, + "train_speed(iter/s)": 0.350912 + }, + { + "epoch": 3.2, + "eval_logits/chosen": -1.046875, + "eval_logits/rejected": 0.1669921875, + "eval_logps/chosen": -6.46875, + "eval_logps/rejected": -286.0, + "eval_loss": 0.4970703125, + "eval_nll_loss": 0.28125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.1875, + "eval_rewards/margins": 14.75, + "eval_rewards/rejected": -4.59375, + "eval_runtime": 1.2858, + "eval_samples_per_second": 3.111, + "eval_steps_per_second": 0.778, + "step": 160 + }, + { + "epoch": 3.3, + "grad_norm": 0.9977986802164632, + "learning_rate": 2.8519322171253602e-05, + "logits/chosen": -0.173828125, + "logits/rejected": 0.234375, + "logps/chosen": -145.0, + "logps/rejected": -498.0, + "loss": 0.4285240173339844, + "memory(GiB)": 43.66, + "nll_loss": 0.16796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.625, + "rewards/margins": 18.5, + "rewards/rejected": -1.7890625, + "step": 165, + "train_speed(iter/s)": 0.350263 + }, + { + "epoch": 3.4, + "grad_norm": 0.5073480559704879, + "learning_rate": 2.5576166707349385e-05, + "logits/chosen": 3.234375, + "logits/rejected": 
-0.92578125, + "logps/chosen": -640.0, + "logps/rejected": -207.0, + "loss": 0.45649566650390627, + "memory(GiB)": 43.66, + "nll_loss": 0.453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 25.0, + "rewards/margins": 19.125, + "rewards/rejected": 5.875, + "step": 170, + "train_speed(iter/s)": 0.35017 + }, + { + "epoch": 3.5, + "grad_norm": 0.5295452954524565, + "learning_rate": 2.2740261391866637e-05, + "logits/chosen": 0.498046875, + "logits/rejected": 0.392578125, + "logps/chosen": -466.0, + "logps/rejected": -592.0, + "loss": 0.39521121978759766, + "memory(GiB)": 43.66, + "nll_loss": 0.65625, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.75, + "rewards/margins": 14.75, + "rewards/rejected": 7.03125, + "step": 175, + "train_speed(iter/s)": 0.350866 + }, + { + "epoch": 3.6, + "grad_norm": 0.30567037033016853, + "learning_rate": 2.002405927680374e-05, + "logits/chosen": 3.09375, + "logits/rejected": -0.09326171875, + "logps/chosen": -482.0, + "logps/rejected": -262.0, + "loss": 0.500946044921875, + "memory(GiB)": 43.66, + "nll_loss": 0.546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.25, + "rewards/margins": 17.125, + "rewards/rejected": 4.125, + "step": 180, + "train_speed(iter/s)": 0.351966 + }, + { + "epoch": 3.6, + "eval_logits/chosen": -1.1640625, + "eval_logits/rejected": 0.060546875, + "eval_logps/chosen": -7.0, + "eval_logps/rejected": -284.0, + "eval_loss": 0.501953125, + "eval_nll_loss": 0.302734375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 14.5, + "eval_rewards/rejected": -4.40625, + "eval_runtime": 1.2632, + "eval_samples_per_second": 3.167, + "eval_steps_per_second": 0.792, + "step": 180 + }, + { + "epoch": 3.7, + "grad_norm": 0.9188395046874438, + "learning_rate": 1.743948777242814e-05, + "logits/chosen": 3.0, + "logits/rejected": -0.10009765625, + "logps/chosen": -536.0, + "logps/rejected": -229.0, + "loss": 0.4772364616394043, + "memory(GiB)": 43.66, + "nll_loss": 0.609375, + 
"rewards/accuracies": 1.0, + "rewards/chosen": 21.25, + "rewards/margins": 16.875, + "rewards/rejected": 4.40625, + "step": 185, + "train_speed(iter/s)": 0.35126 + }, + { + "epoch": 3.8, + "grad_norm": 0.5288801717382393, + "learning_rate": 1.4997896271528739e-05, + "logits/chosen": 0.9765625, + "logits/rejected": 1.4140625, + "logps/chosen": -330.0, + "logps/rejected": -708.0, + "loss": 0.4629364013671875, + "memory(GiB)": 43.66, + "nll_loss": 0.6875, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.75, + "rewards/margins": 18.5, + "rewards/rejected": 1.34375, + "step": 190, + "train_speed(iter/s)": 0.352271 + }, + { + "epoch": 3.9, + "grad_norm": 0.5916934882939896, + "learning_rate": 1.2710006311864104e-05, + "logits/chosen": -0.31640625, + "logits/rejected": 0.78125, + "logps/chosen": -414.0, + "logps/rejected": -680.0, + "loss": 0.38446922302246095, + "memory(GiB)": 43.66, + "nll_loss": 0.435546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.75, + "rewards/margins": 20.0, + "rewards/rejected": 2.765625, + "step": 195, + "train_speed(iter/s)": 0.352064 + }, + { + "epoch": 4.0, + "grad_norm": 0.6513423080203622, + "learning_rate": 1.0585864495652897e-05, + "logits/chosen": 1.2578125, + "logits/rejected": 0.53125, + "logps/chosen": -414.0, + "logps/rejected": -728.0, + "loss": 0.43031768798828124, + "memory(GiB)": 56.51, + "nll_loss": 0.48046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.875, + "rewards/margins": 18.125, + "rewards/rejected": 3.78125, + "step": 200, + "train_speed(iter/s)": 0.351751 + }, + { + "epoch": 4.0, + "eval_logits/chosen": -1.265625, + "eval_logits/rejected": 0.0634765625, + "eval_logps/chosen": -7.5625, + "eval_logps/rejected": -286.0, + "eval_loss": 0.509765625, + "eval_nll_loss": 0.328125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 14.75, + "eval_rewards/rejected": -4.59375, + "eval_runtime": 1.2468, + "eval_samples_per_second": 3.208, + "eval_steps_per_second": 
0.802, + "step": 200 + }, + { + "epoch": 4.1, + "grad_norm": 0.46757622724638764, + "learning_rate": 8.634798372847148e-06, + "logits/chosen": 0.8984375, + "logits/rejected": 1.0546875, + "logps/chosen": -428.0, + "logps/rejected": -704.0, + "loss": 0.4748867034912109, + "memory(GiB)": 56.51, + "nll_loss": 0.4375, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.5, + "rewards/margins": 16.125, + "rewards/rejected": 5.40625, + "step": 205, + "train_speed(iter/s)": 0.35013 + }, + { + "epoch": 4.2, + "grad_norm": 0.5521912104468201, + "learning_rate": 6.865375481914016e-06, + "logits/chosen": 2.734375, + "logits/rejected": -0.33984375, + "logps/chosen": -274.0, + "logps/rejected": -230.0, + "loss": 0.42456893920898436, + "memory(GiB)": 56.51, + "nll_loss": 0.34375, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.125, + "rewards/margins": 16.375, + "rewards/rejected": 3.71875, + "step": 210, + "train_speed(iter/s)": 0.350785 + }, + { + "epoch": 4.3, + "grad_norm": 0.6123257585261611, + "learning_rate": 5.285365727986707e-06, + "logits/chosen": -1.28125, + "logits/rejected": 1.8359375, + "logps/chosen": -211.0, + "logps/rejected": -1216.0, + "loss": 0.42719383239746095, + "memory(GiB)": 56.51, + "nll_loss": 0.326171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 19.0, + "rewards/margins": 18.25, + "rewards/rejected": 0.8046875, + "step": 215, + "train_speed(iter/s)": 0.35093 + }, + { + "epoch": 4.4, + "grad_norm": 0.3529237641617113, + "learning_rate": 3.901707263589671e-06, + "logits/chosen": -1.171875, + "logits/rejected": 1.640625, + "logps/chosen": -135.0, + "logps/rejected": -984.0, + "loss": 0.3799613952636719, + "memory(GiB)": 56.51, + "nll_loss": 0.236328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.25, + "rewards/margins": 18.375, + "rewards/rejected": -2.078125, + "step": 220, + "train_speed(iter/s)": 0.350899 + }, + { + "epoch": 4.4, + "eval_logits/chosen": -1.3125, + "eval_logits/rejected": 0.03662109375, + "eval_logps/chosen": 
-7.9375, + "eval_logps/rejected": -290.0, + "eval_loss": 0.51318359375, + "eval_nll_loss": 0.345703125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.0625, + "eval_rewards/margins": 15.0625, + "eval_rewards/rejected": -5.0, + "eval_runtime": 1.3289, + "eval_samples_per_second": 3.01, + "eval_steps_per_second": 0.753, + "step": 220 + }, + { + "epoch": 4.5, + "grad_norm": 0.4122296158011087, + "learning_rate": 2.7204760217631074e-06, + "logits/chosen": 4.0, + "logits/rejected": -0.318359375, + "logps/chosen": -696.0, + "logps/rejected": -280.0, + "loss": 0.42312088012695315, + "memory(GiB)": 56.51, + "nll_loss": 0.70703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 25.875, + "rewards/margins": 19.75, + "rewards/rejected": 6.09375, + "step": 225, + "train_speed(iter/s)": 0.350059 + }, + { + "epoch": 4.6, + "grad_norm": 0.446952222017286, + "learning_rate": 1.7468590353731495e-06, + "logits/chosen": 0.98828125, + "logits/rejected": 0.1640625, + "logps/chosen": -320.0, + "logps/rejected": -716.0, + "loss": 0.3846301078796387, + "memory(GiB)": 56.51, + "nll_loss": 0.419921875, + "rewards/accuracies": 1.0, + "rewards/chosen": 20.5, + "rewards/margins": 20.375, + "rewards/rejected": 0.07177734375, + "step": 230, + "train_speed(iter/s)": 0.35023 + }, + { + "epoch": 4.7, + "grad_norm": 0.7152024714830698, + "learning_rate": 9.851316597681958e-07, + "logits/chosen": 0.1025390625, + "logits/rejected": 1.296875, + "logps/chosen": -221.0, + "logps/rejected": -712.0, + "loss": 0.3764499664306641, + "memory(GiB)": 56.51, + "nll_loss": 0.3046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.25, + "rewards/margins": 18.875, + "rewards/rejected": -1.71875, + "step": 235, + "train_speed(iter/s)": 0.350576 + }, + { + "epoch": 4.8, + "grad_norm": 0.32906938877953124, + "learning_rate": 4.386387988014273e-07, + "logits/chosen": 0.9375, + "logits/rejected": -0.416015625, + "logps/chosen": -556.0, + "logps/rejected": -828.0, + "loss": 0.4298358917236328, + 
"memory(GiB)": 56.51, + "nll_loss": 0.8515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 27.0, + "rewards/margins": 20.5, + "rewards/rejected": 6.5, + "step": 240, + "train_speed(iter/s)": 0.351336 + }, + { + "epoch": 4.8, + "eval_logits/chosen": -1.34375, + "eval_logits/rejected": 0.037109375, + "eval_logps/chosen": -8.3125, + "eval_logps/rejected": -292.0, + "eval_loss": 0.51806640625, + "eval_nll_loss": 0.361328125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.0, + "eval_rewards/margins": 15.1875, + "eval_rewards/rejected": -5.1875, + "eval_runtime": 1.2957, + "eval_samples_per_second": 3.087, + "eval_steps_per_second": 0.772, + "step": 240 + }, + { + "epoch": 4.9, + "grad_norm": 0.4663439808991418, + "learning_rate": 1.0978021666005478e-07, + "logits/chosen": 1.9296875, + "logits/rejected": 0.1611328125, + "logps/chosen": -548.0, + "logps/rejected": -624.0, + "loss": 0.4468417167663574, + "memory(GiB)": 56.51, + "nll_loss": 0.57421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 22.75, + "rewards/margins": 20.0, + "rewards/rejected": 2.78125, + "step": 245, + "train_speed(iter/s)": 0.350845 + }, + { + "epoch": 5.0, + "grad_norm": 0.3211848914019023, + "learning_rate": 0.0, + "logits/chosen": 1.609375, + "logits/rejected": 0.6953125, + "logps/chosen": -398.0, + "logps/rejected": -464.0, + "loss": 0.4753856658935547, + "memory(GiB)": 56.51, + "nll_loss": 0.466796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.875, + "rewards/margins": 20.125, + "rewards/rejected": 1.78125, + "step": 250, + "train_speed(iter/s)": 0.351443 + }, + { + "epoch": 5.0, + "eval_logits/chosen": -1.3515625, + "eval_logits/rejected": 0.0341796875, + "eval_logps/chosen": -8.125, + "eval_logps/rejected": -292.0, + "eval_loss": 0.51513671875, + "eval_nll_loss": 0.353515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.0625, + "eval_rewards/margins": 15.25, + "eval_rewards/rejected": -5.1875, + "eval_runtime": 1.2604, + 
"eval_samples_per_second": 3.174, + "eval_steps_per_second": 0.793, + "step": 250 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 219416242290688.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/training_args.bin b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..475462e40417630252a03e5a9352b5e11f000ed1 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:647dcffabf1a4c913f292b185978f2f179c4819a3df809536d284c803d9d97fe +size 8888 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/zero_to_fp32.py b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. 
Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def 
get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + 
frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensor instead of torch tensor, which is more memory efficient. + Convert the pseudo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint.
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/README.md b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/README.md new file mode 100644 index 0000000000000000000000000000000000000000..525e0a2a5243ae77491e227cbf74fa8ff0a43579 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-7b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct 
and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/adapter_config.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..454fa4ee7fc180fe11671ee7f6c90d0ba44328a6 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + 
"rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "gate_proj", + "up_proj", + "o_proj", + "v_proj", + "down_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/adapter_model.safetensors b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..319ac555e9024ee2d26cafb9b463b0b3b22d7bc3 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7cdc5cf9a11d32e3d4e26fd74721abe48abbac3074c706a1c5f03c7aeaa2c95a +size 40422208 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/additional_config.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/args.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/args.json new file mode 100644 index 0000000000000000000000000000000000000000..4b016b97ef2d4589dae9d4623f25925cb829d241 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/args.json @@ -0,0 +1,371 @@ +{ + "model": 
"/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + 
"do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + 
"fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + 
"push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + 
"galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-7b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-7b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, 
task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, 
per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, 
use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, 
neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..40dc97af0f7acac1009a1a3cd0ae9ab2bf1873c1 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:083e8671ca6f0f38b18c7cc0c5ba00612e5d9b0cf5cb1e89642429fe44df07e2 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ac0dbf8608fb76e54b64959467f29deddf82f77a --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4489f21a5c3addbb3a26b37e19dc42fb2f972ca0b07211d220c8b6f739edced +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ff9893d04e765db61894e179794f1a066c5227a9 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d7e93599d361603debde8ebfc1a49539547e684d37445c44e1e066e051308c4 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef216e1cd9e45fb540f7a7c9fa338229f0ba31c7 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:bf161e418968d956955d6810f3313c310c1838a60f7b4080ae60e933e21127dc +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a9da647fe4da25a5f3758e5443629d9bff6ec453 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cf770d34e6513b13dadae7681e00f32b9ca82e3d35b0b90573e4162bbee80bc +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0ecbfd25edef0f0b1fbfe102f1b45c68028f4e6e --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1151376896ab0e28f7bef497048dd71a9e315c04c1eb1914775fe375b7bfa204 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a6837afa6ffb1e8753ac7ad0559dba28df49645 --- 
/dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1e1a9c5b900af102f551280368e0a83238ea216a8bacc71d32b6108ee953238 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..25ef950b32757db0ad66317cf689bc5b7cd007e8 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adc6ce2c7c4bbb2fb82603155d6a788fd46cc556d765b6adcd3f553dc37d4914 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..08bcae76bc0b674177e6ca38779953ced14fdeb8 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc5e762d01477841adca18cdb0ad695c823b3a2b77ccfc940ca3c35845b85e8c +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c39104696e65b0a2cd9a1bbe5f84ae9d926466f5 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c9dd4ec5d7e1c8f7847998a9047027bd5c3feef400c61cdad017c5683ad6311 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/zero_pp_rank_2_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..09313d283987a8853e57652747441c298c02d066 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87af20de5630bb83514f9b9ca4e73728a6433670be848ab45ce981924ed8ac82 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/zero_pp_rank_3_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2d9f7638f3e72d6bcbb2537d6b6f62d967e0e6e6 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:90dccc2eedf8b0a194f2a9f1f4d0da6e825f694d240aa0f76aacf7f80bc318b9 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/zero_pp_rank_4_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d63150aaaff693966b8830db9918fed56d73ce3d --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:589d1b2ef8b9e56e87c381141d9b96db6cabe219902289f6ae38dc9b1118685d +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/zero_pp_rank_5_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..af3830f1d91d9078fc1ea035c19df54aefa56877 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0e477f49629e1e9b1ef6e7bfac380c3f020bbd6da6458e50016258d5e5bc302 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/zero_pp_rank_6_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6215175823ccf9da36f3b5d115782bf285d18b18 --- /dev/null +++ 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e851605536ee44ef4dad5acbda2cc4e94f8182fcad7c13d2c8449558d38d6ea5 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/zero_pp_rank_7_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6d62b5d3881062bafa1febf18d9a0adddc4f9f94 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/global_step40/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4b08247c6cef2e9d0fccee2a8b735da9c2688906da346ca739baff797d8e3a0 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/latest b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/latest new file mode 100644 index 0000000000000000000000000000000000000000..8631ab8ddebf60eb3e7f5f2c2b1a2da8298a43c3 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/latest @@ -0,0 +1 @@ +global_step40 \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/rng_state_0.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..35252259eb09d8de259231f63f19e786e44bc7b0 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/rng_state_0.pth @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:08282b46825aa78d10fe10e3fea89555c5b5a691b261a3ddfd58fcb58370edff +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/rng_state_1.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..eeb7d8df6ed170dd98dba8737bc9dd038af61afd --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbab71d98a3a9a92df82a6bba463947327c3a1bcf35cd9f4f46114641fc42dd9 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/rng_state_2.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..e144a445ffd57fbb5be9b5131f17149bde6c4ff5 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:caac82d57d878d30219a4f9ec289a97ff90c53afc160b968f251b3fd3454b8d8 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/rng_state_3.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..a10f35268ac2a0cb68abc0b78ba5b150b0f29d78 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19762d2d370222b01817da11bbaa6665d542293373186d66f754e7246bb861ed +size 15984 diff --git 
a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/rng_state_4.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..e6ef21562e384e0889ec2400e8f84b6b0bc59035 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00c7508b346a7d3c5c23392845f1d013331114ade778794b76e919cb3ed5d33e +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/rng_state_5.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..126662e5f97dd3f9cd7fb87e70843d3d5532dde3 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b89de7d14dd20a191f56b74c816ef8b7fe5c171e31efbeadbf321c4539ed68c3 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/rng_state_6.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d4e6b27cc99b0fa8e6bbf967892f9304b444d81d --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c71152053553e6e22d670fbc4fd7550bf8a046b54cad7b71869787986a6a42c +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/rng_state_7.pth 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..14e214a61e5311f2b7edf2200ec0365ed2dcc5e1 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b67db12a26a26ffe03d9afc84a43857eb2e5b2fec2dd189653b415f74208190 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/scheduler.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f899c511136dcc56c12c5058062d17686812d1b5 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ab67fe2c6ce9ef0b836ccb0828d5bb1b593055af205b40af49c85139e46a6c8 +size 1064 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/trainer_state.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c120033e268a40ee51b4158e7f2331af6fb23301 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/trainer_state.json @@ -0,0 +1,229 @@ +{ + "best_metric": 0.52832031, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40", + "epoch": 0.8, + "eval_steps": 20, + "global_step": 40, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 9.15866064284304, + 
"learning_rate": 7.692307692307694e-06, + "logits/chosen": 2.390625, + "logits/rejected": -2.546875, + "logps/chosen": -520.0, + "logps/rejected": -142.0, + "loss": 2.4697265625, + "memory(GiB)": 8.4, + "nll_loss": 0.51171875, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.126463 + }, + { + "epoch": 0.1, + "grad_norm": 26.585759281524314, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": 0.1533203125, + "logits/rejected": -0.8671875, + "logps/chosen": -728.0, + "logps/rejected": -304.0, + "loss": 2.3878173828125, + "memory(GiB)": 12.04, + "nll_loss": 1.1484375, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.125, + "rewards/margins": -0.0250244140625, + "rewards/rejected": 0.150390625, + "step": 5, + "train_speed(iter/s)": 0.276117 + }, + { + "epoch": 0.2, + "grad_norm": 8.664467296577401, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -0.419921875, + "logits/rejected": 1.2890625, + "logps/chosen": -644.0, + "logps/rejected": -572.0, + "loss": 2.18505859375, + "memory(GiB)": 17.87, + "nll_loss": 1.4921875, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.1103515625, + "rewards/margins": 0.08984375, + "rewards/rejected": 0.0201416015625, + "step": 10, + "train_speed(iter/s)": 0.313547 + }, + { + "epoch": 0.3, + "grad_norm": 6.325430646213694, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -0.7578125, + "logits/rejected": 1.8203125, + "logps/chosen": -924.0, + "logps/rejected": -1552.0, + "loss": 1.76845703125, + "memory(GiB)": 32.02, + "nll_loss": 1.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.75, + "rewards/margins": 2.390625, + "rewards/rejected": 0.359375, + "step": 15, + "train_speed(iter/s)": 0.318522 + }, + { + "epoch": 0.4, + "grad_norm": 2.351010309283972, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": 0.5859375, + "logits/rejected": -0.4375, + "logps/chosen": -432.0, + 
"logps/rejected": -324.0, + "loss": 1.172265625, + "memory(GiB)": 32.02, + "nll_loss": 0.6640625, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 4.65625, + "rewards/margins": 1.5546875, + "rewards/rejected": 3.09375, + "step": 20, + "train_speed(iter/s)": 0.331194 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.3125, + "eval_logits/rejected": 0.86328125, + "eval_logps/chosen": -14.3125, + "eval_logps/rejected": -182.0, + "eval_loss": 0.6728515625, + "eval_nll_loss": 0.62109375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.375, + "eval_rewards/margins": 3.5625, + "eval_rewards/rejected": 5.8125, + "eval_runtime": 1.2862, + "eval_samples_per_second": 3.11, + "eval_steps_per_second": 0.778, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.5984307422346672, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": 0.255859375, + "logits/rejected": -0.0390625, + "logps/chosen": -326.0, + "logps/rejected": -420.0, + "loss": 0.8047607421875, + "memory(GiB)": 32.02, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.125, + "rewards/margins": 5.40625, + "rewards/rejected": 3.6875, + "step": 25, + "train_speed(iter/s)": 0.330795 + }, + { + "epoch": 0.6, + "grad_norm": 0.5223734060204478, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -0.21875, + "logits/rejected": 1.546875, + "logps/chosen": -400.0, + "logps/rejected": -656.0, + "loss": 0.6901611328125, + "memory(GiB)": 32.02, + "nll_loss": 0.78125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.5625, + "rewards/margins": 6.78125, + "rewards/rejected": 3.765625, + "step": 30, + "train_speed(iter/s)": 0.339407 + }, + { + "epoch": 0.7, + "grad_norm": 1.0839186250481967, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": 1.5625, + "logits/rejected": -1.578125, + "logps/chosen": -592.0, + "logps/rejected": -322.0, + "loss": 0.7055419921875, + "memory(GiB)": 32.02, + "nll_loss": 0.96875, + "rewards/accuracies": 1.0, + 
"rewards/chosen": 12.75, + "rewards/margins": 9.625, + "rewards/rejected": 3.125, + "step": 35, + "train_speed(iter/s)": 0.346622 + }, + { + "epoch": 0.8, + "grad_norm": 13.025539300060226, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": 0.08740234375, + "logits/rejected": 0.71484375, + "logps/chosen": -225.0, + "logps/rejected": -322.0, + "loss": 0.6363037109375, + "memory(GiB)": 32.02, + "nll_loss": 0.56640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.0625, + "rewards/margins": 7.53125, + "rewards/rejected": 3.546875, + "step": 40, + "train_speed(iter/s)": 0.345794 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -0.8046875, + "eval_logits/rejected": 0.9375, + "eval_logps/chosen": -7.1875, + "eval_logps/rejected": -212.0, + "eval_loss": 0.5283203125, + "eval_nll_loss": 0.3125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 7.3125, + "eval_rewards/rejected": 2.796875, + "eval_runtime": 1.2905, + "eval_samples_per_second": 3.1, + "eval_steps_per_second": 0.775, + "step": 40 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 35224365432832.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/training_args.bin b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..475462e40417630252a03e5a9352b5e11f000ed1 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/training_args.bin @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:647dcffabf1a4c913f292b185978f2f179c4819a3df809536d284c803d9d97fe +size 8888 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/zero_to_fp32.py b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-40/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
class GatheredTensor:
    """
    A pseudo tensor that collects partitioned weights.
    It is more memory efficient when there are multiple groups.

    The object only records *where* a parameter lives inside the per-rank flat groups; the actual
    tensor is materialized on demand by ``contiguous()``.

    Args:
        - ``flat_groups``: one entry per rank; each entry is a sequence of 1-D flat tensors (one per group)
        - ``flat_groups_offset``: cumulative element counts of the groups, starting with 0, so group
          ``g`` covers ``[flat_groups_offset[g], flat_groups_offset[g + 1])`` of a rank's flat space
        - ``offset``: start of this parameter's partition inside a rank's flat space
        - ``partitioned_numel``: number of elements each rank holds for this parameter
        - ``shape``: full shape of the parameter (must provide ``.numel()``, e.g. ``torch.Size``)
    """

    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
        self.flat_groups = flat_groups
        self.flat_groups_offset = flat_groups_offset
        self.offset = offset
        self.partitioned_numel = partitioned_numel
        self.shape = shape
        self.dtype = self.flat_groups[0][0].dtype

    def contiguous(self):
        """
        Merge partitioned weights from flat_groups into a single tensor.

        Returns:
            - a contiguous tensor of shape ``self.shape``; trailing padding elements are dropped

        Raises:
            - ``ValueError``: if ``[offset, offset + partitioned_numel)`` is not fully covered by
              the groups described by ``flat_groups_offset``
        """
        end_idx = self.offset + self.partitioned_numel

        # The group layout is described by a single offset table shared by all ranks, so the
        # (group, start, stop) slices are computed once instead of once per rank.
        group_slices = []
        for group_id in range(len(self.flat_groups_offset) - 1):
            group_start = self.flat_groups_offset[group_id]
            group_end = self.flat_groups_offset[group_id + 1]
            # Clamp to the group's bounds: when a partition spans several groups, the slice in
            # every group after the first must start at 0, never at a negative index.
            lo = max(self.offset, group_start)
            hi = min(end_idx, group_end)
            if lo < hi:
                group_slices.append((group_id, lo - group_start, hi - group_start))

        covered = sum(stop - start for _, start, stop in group_slices)
        if covered != self.partitioned_numel:
            raise ValueError(f"partition [{self.offset}, {end_idx}) is not covered by the flat groups "
                             f"(found {covered} of {self.partitioned_numel} elements)")

        # collect weights from all ranks, rank by rank, groups in order
        pad_flat_param_chunks = [
            flat_groups_at_rank_i[group_id][start:stop] for flat_groups_at_rank_i in self.flat_groups
            for group_id, start, stop in group_slices
        ]

        if not pad_flat_param_chunks:
            # zero-element partition: torch.cat refuses an empty list, so build the result directly
            return torch.empty(self.shape, dtype=self.dtype)

        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
        return param
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
def to_torch_tensor(state_dict, return_empty_tensor=False):
    """
    Materialize a state_dict of lazy (``GatheredTensor``-like) entries into real torch tensors.

    Entries that are the very same object in ``state_dict`` (shared/tied weights) are converted
    once and the resulting tensor is reused for every name that refers to it.

    Args:
        - ``state_dict``: mapping of name -> object exposing ``.shape``, ``.dtype`` and ``.contiguous()``
        - ``return_empty_tensor``: if True, allocate uninitialized tensors of the right shape/dtype
          instead of calling ``.contiguous()`` on each entry

    Returns:
        - a new dict with the same keys, in the same order, holding torch tensors
    """
    materialized = {}
    # id(source object) -> tensor already produced for it
    by_identity = {}
    for key, source in state_dict.items():
        ident = id(source)
        if ident not in by_identity:
            if return_empty_tensor:
                by_identity[ident] = torch.empty(source.shape, dtype=source.dtype)
            else:
                by_identity[ident] = source.contiguous()
        materialized[key] = by_identity[ident]
    return materialized
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
                                               output_dir,
                                               max_shard_size="5GB",
                                               safe_serialization=False,
                                               tag=None,
                                               exclude_frozen_parameters=False):
    """
    Convert ZeRO 2 or 3 checkpoint into fp32 consolidated ``state_dict`` file(s) written to
    ``output_dir``. The output is a single weights file, or several shard files plus a JSON index
    when the state_dict is split according to ``max_shard_size``. The files can be loaded with
    ``torch.load(file)`` + ``load_state_dict()`` (or with ``safetensors`` when
    ``safe_serialization`` is set) and used for training without DeepSpeed.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``output_dir``: directory to the pytorch fp32 state_dict output files (created if missing)
        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB.
          Pass ``None`` to disable sharding and write everything into one file.
        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
        - ``exclude_frozen_parameters``: exclude frozen parameters

    Raises:
        - ``ImportError``: re-raised (after printing an install hint) when ``safetensors`` is needed
          for ``safe_serialization`` or ``huggingface_hub`` is needed for sharding but is not installed
    """

    # Dependency pre-check: fail early, before any expensive checkpoint loading
    if safe_serialization:
        try:
            from safetensors.torch import save_file
        except ImportError:
            print('If you want to use `safe_serialization`, please `pip install safetensors`')
            raise
    if max_shard_size is not None:
        try:
            from huggingface_hub import split_torch_state_dict_into_shards
        except ImportError:
            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
            raise

    # Convert zero checkpoint to state_dict (lazy mode: entries are materialized per shard below)
    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
                                                          tag,
                                                          exclude_frozen_parameters,
                                                          lazy_mode=True)

    # Shard the model if it is too big.
    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
    if max_shard_size is not None:
        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
        # a memory-efficient approach for sharding: plan the split on empty tensors that only
        # carry shape and dtype, so the real weights are not materialized yet
        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
                                                              filename_pattern=filename_pattern,
                                                              max_shard_size=max_shard_size)
    else:
        # No sharding requested: build a minimal stand-in exposing just the two attributes used
        # below (``is_sharded`` and ``filename_to_tensors``)
        from collections import namedtuple
        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
        state_dict_split = StateDictSplit(is_sharded=False,
                                          filename_to_tensors={weights_name: list(state_dict.keys())})

    # Save the model by shard
    os.makedirs(output_dir, exist_ok=True)
    filename_to_tensors = state_dict_split.filename_to_tensors.items()
    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
        # materialize only the tensors that belong to the current shard
        shard_state_dict = to_torch_tensor(shard_state_dict)
        output_path = os.path.join(output_dir, shard_file)
        if safe_serialization:
            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
        else:
            torch.save(shard_state_dict, output_path)
        # release the memory of current shard
        for tensor_name in list(shard_state_dict.keys()):
            del state_dict[tensor_name]
            del shard_state_dict[tensor_name]
        del shard_state_dict
        gc.collect()

    # Save index if sharded
    if state_dict_split.is_sharded:
        index = {
            "metadata": state_dict_split.metadata,
            "weight_map": state_dict_split.tensor_to_filename,
        }
        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
        save_index_file = os.path.join(output_dir, save_index_file)
        with open(save_index_file, "w", encoding="utf-8") as f:
            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
            f.write(content)
if __name__ == "__main__":
    # Command-line entry point: parse the arguments and convert the ZeRO checkpoint found in
    # ``checkpoint_dir`` into consolidated fp32 weights under ``output_dir``.
    parser = argparse.ArgumentParser()
    parser.add_argument("checkpoint_dir",
                        type=str,
                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
    # Adjacent string literals are concatenated verbatim, so every fragment but the last must end
    # with a space; otherwise the words run together in the rendered ``--help`` text.
    parser.add_argument("output_dir",
                        type=str,
                        help="directory to the pytorch fp32 state_dict output files "
                        "(e.g. path/checkpoint-12-output/)")
    parser.add_argument(
        "--max_shard_size",
        type=str,
        default="5GB",
        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size "
        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`). "
        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances "
        "without CPU OOM issues.")
    parser.add_argument(
        "--safe_serialization",
        default=False,
        action='store_true',
        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
    parser.add_argument("-t",
                        "--tag",
                        type=str,
                        default=None,
                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
    args = parser.parse_args()

    # module-level flag read by the conversion helpers to enable verbose prints
    debug = args.debug

    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
                                               args.output_dir,
                                               max_shard_size=args.max_shard_size,
                                               safe_serialization=args.safe_serialization,
                                               tag=args.tag,
                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/adapter_config.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..454fa4ee7fc180fe11671ee7f6c90d0ba44328a6 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + 
"rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "gate_proj", + "up_proj", + "o_proj", + "v_proj", + "down_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/adapter_model.safetensors b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..739b9e7ba6ce22e3bbbf98269407ec5425aa2c76 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30b5b0f1392136e9b615a407d2f3faf94cf1a81296af6f181a44477425158183 +size 40422208 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/additional_config.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/args.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/args.json new file mode 100644 index 0000000000000000000000000000000000000000..4b016b97ef2d4589dae9d4623f25925cb829d241 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/args.json @@ -0,0 +1,371 @@ +{ + "model": 
"/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + 
"do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + 
"fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + 
"push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + 
"galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-7b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-7b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, 
task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, 
per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, 
use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, 
neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..876c717c002f04276e9eca6ffa8a35dbd5db21b9 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1f2b3a6e640c5bb44eea5ac41600fc342a8d959305697c1ebfa4a417f1f3908 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b31791466145f0bf0cb6444dd7ecf72dbeebea3 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d638a806958712f6aa782a44866ddd4b3231c4094174eb02d8b202923037a7c +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ca4e6300da315cfec23d28748a06dd83dc9da5b3 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9346efba472cc983349be19f742858a69c7dcf4adea7d84274dc81357cf5c39 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9e791f6e579fa94a6e3f665c3e42a20cb894c4a7 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:3ae1e27ea1a79f9371a3570dc2272acd0dde079baa3481d0a76bc8c1c8eac0ad +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..80307540a05eb16e1602398c32b25732c1cb05bb --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c60b77da39480d711ce31a73b2143ae2b8ab5cf7da9fb834b7e729d1360b16b0 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..03bf9c33f48691109d1cc6ff9021540d49413661 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:869b4d8e44ec0fe1580451d10e305f571104cd9e81199792cb8550dd1378fc00 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..da91a70ccfc8c995eb192b15f2325a4511ae907c --- 
/dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e12d9637bcd157f358ecd824d335bb19010b119cfc200440f68f082fabbe3bf3 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1ac2aa43f61f08f4cbad6c1c9f65e777099f3635 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44db0ec838c00f32c04002c4b64afd4698f2c3e919fc55aee209fa22cfba94ea +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/zero_pp_rank_0_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..105bc04962496f57ebd47ec584f1aa459ab271f9 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f31518bae8f95e1a71abdb7bea54c5f4786b1611b4e0b78c4f4105863ab94661 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/zero_pp_rank_1_mp_rank_00_model_states.pt 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..39f64c83a2c1cab0cde9f3eff6991698707b50a5 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7da05cbcae38c3c83dcd7743d540e5e504a94cc9d2cc102b1d37947ef301fc0 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/zero_pp_rank_2_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f6d6cc746f3fd25e6902d34df447756172e1db73 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43fc134d8b9b1495c1295ddfd1dc0bed2361e9941519a23ed4764c96567fa024 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/zero_pp_rank_3_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f17912ec1ddf7d75dc98a4496ee8947d51899334 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:feb8f82225896a3d0ab0f12ca940e95efb35d356c385d2fbe65c05ae95ba89d8 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/zero_pp_rank_4_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..75195444993ce0fef6c83c6d68bd6ddafaf9f3b4 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9594125ae021cf6fb846d5b6c8aef00b74c8e4019c27d2f6bc79b9ddc5ab2df3 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/zero_pp_rank_5_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..752fc6adf29ac8a61ca19e73687646c88c7fda7a --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5acbd78d212e1d5ef3044dd260d2feb68757d6dbad68dabd00957a3be6d5d6b4 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/zero_pp_rank_6_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9e7b04938259cddd30955b868ae27a95ea19b7bb --- /dev/null +++ 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b9e3c5c693725c04851f78fc21f92a9ceeaf9b4a59d1216d043d57dfc9aa1c0 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/zero_pp_rank_7_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..57ce614748c604b7aedcf44b0fb63827188bd399 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/global_step60/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a2510329223e1dcd355a312b91a7eb9ad3865b7e34dbd7f0631db9a6a9a4d16 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/latest b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/latest new file mode 100644 index 0000000000000000000000000000000000000000..6dac34b840ecfb636ba8ab1e4da79fa1bdc8c3d4 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/latest @@ -0,0 +1 @@ +global_step60 \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/rng_state_0.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..9d31438b0bfd38acb69501aeb325fee7751b84e8 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/rng_state_0.pth @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:5a0ef6f96a48e59aa52c4b471312c2a62378c19acc7ebbae839612b03a7d775a +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/rng_state_1.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..6759906b0863c54055155658e8d374770ecfc5f9 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab11d533c0fdad46ea8b8e295ba5fdb705e078eeb88cc28f37d82913508766e9 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/rng_state_2.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..784c719c042a2cca1f38818c7e9638aab398c859 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:615c168147e3465ce5bfab6da2ff4afc68566ce00ec0f0c6c9fc988038a58d0a +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/rng_state_3.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b32e705bcb6afbb2ab95f5c68c07d0ccc3d457df --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79f71e8f8674ecaef9f8cdcbf7ac457a8b8ff15b12694ba2a2fffcb4b43f0f08 +size 15984 diff --git 
a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/rng_state_4.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..37b97b2d3011e43a6dbac487263b52a0b3a55c83 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88cf6d674dab5545c300a55135f08ca935730a3d35e2c419fb0b333f19482c19 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/rng_state_5.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..1acb3d3b1d3de061b997d1dee57e44b465d0630e --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2754f2cd8824702f027870d93748b3c0491b0ecd30f1e3d8e937116b2be6151f +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/rng_state_6.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..7760bbbcd6d3754ac81a5218adb6e0cd8036905b --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1385124ac55604598f45ea6e2d141f29456647d3e7c10d12ca64ec93d312be8d +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/rng_state_7.pth 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..d8c4407057d0cb21c08140413cb320528190a941 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:416538efaec7391fa8fe782fb15146b83e5612d9e1961292c34c53e964806873 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/scheduler.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..728c3241a49cbd920d5df86255fc8be4d97c5519 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fa3ba485fff4300fd9029c17ba92c92630af852a00df0a0e8d16c233f74cbc8 +size 1064 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/trainer_state.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..76c7340c4b12919d78b7e505976b0feaf5530adc --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/trainer_state.json @@ -0,0 +1,318 @@ +{ + "best_metric": 0.51806641, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60", + "epoch": 1.2, + "eval_steps": 20, + "global_step": 60, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 9.15866064284304, + 
"learning_rate": 7.692307692307694e-06, + "logits/chosen": 2.390625, + "logits/rejected": -2.546875, + "logps/chosen": -520.0, + "logps/rejected": -142.0, + "loss": 2.4697265625, + "memory(GiB)": 8.4, + "nll_loss": 0.51171875, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.126463 + }, + { + "epoch": 0.1, + "grad_norm": 26.585759281524314, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": 0.1533203125, + "logits/rejected": -0.8671875, + "logps/chosen": -728.0, + "logps/rejected": -304.0, + "loss": 2.3878173828125, + "memory(GiB)": 12.04, + "nll_loss": 1.1484375, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.125, + "rewards/margins": -0.0250244140625, + "rewards/rejected": 0.150390625, + "step": 5, + "train_speed(iter/s)": 0.276117 + }, + { + "epoch": 0.2, + "grad_norm": 8.664467296577401, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -0.419921875, + "logits/rejected": 1.2890625, + "logps/chosen": -644.0, + "logps/rejected": -572.0, + "loss": 2.18505859375, + "memory(GiB)": 17.87, + "nll_loss": 1.4921875, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.1103515625, + "rewards/margins": 0.08984375, + "rewards/rejected": 0.0201416015625, + "step": 10, + "train_speed(iter/s)": 0.313547 + }, + { + "epoch": 0.3, + "grad_norm": 6.325430646213694, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -0.7578125, + "logits/rejected": 1.8203125, + "logps/chosen": -924.0, + "logps/rejected": -1552.0, + "loss": 1.76845703125, + "memory(GiB)": 32.02, + "nll_loss": 1.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.75, + "rewards/margins": 2.390625, + "rewards/rejected": 0.359375, + "step": 15, + "train_speed(iter/s)": 0.318522 + }, + { + "epoch": 0.4, + "grad_norm": 2.351010309283972, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": 0.5859375, + "logits/rejected": -0.4375, + "logps/chosen": -432.0, + 
"logps/rejected": -324.0, + "loss": 1.172265625, + "memory(GiB)": 32.02, + "nll_loss": 0.6640625, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 4.65625, + "rewards/margins": 1.5546875, + "rewards/rejected": 3.09375, + "step": 20, + "train_speed(iter/s)": 0.331194 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.3125, + "eval_logits/rejected": 0.86328125, + "eval_logps/chosen": -14.3125, + "eval_logps/rejected": -182.0, + "eval_loss": 0.6728515625, + "eval_nll_loss": 0.62109375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.375, + "eval_rewards/margins": 3.5625, + "eval_rewards/rejected": 5.8125, + "eval_runtime": 1.2862, + "eval_samples_per_second": 3.11, + "eval_steps_per_second": 0.778, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.5984307422346672, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": 0.255859375, + "logits/rejected": -0.0390625, + "logps/chosen": -326.0, + "logps/rejected": -420.0, + "loss": 0.8047607421875, + "memory(GiB)": 32.02, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.125, + "rewards/margins": 5.40625, + "rewards/rejected": 3.6875, + "step": 25, + "train_speed(iter/s)": 0.330795 + }, + { + "epoch": 0.6, + "grad_norm": 0.5223734060204478, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -0.21875, + "logits/rejected": 1.546875, + "logps/chosen": -400.0, + "logps/rejected": -656.0, + "loss": 0.6901611328125, + "memory(GiB)": 32.02, + "nll_loss": 0.78125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.5625, + "rewards/margins": 6.78125, + "rewards/rejected": 3.765625, + "step": 30, + "train_speed(iter/s)": 0.339407 + }, + { + "epoch": 0.7, + "grad_norm": 1.0839186250481967, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": 1.5625, + "logits/rejected": -1.578125, + "logps/chosen": -592.0, + "logps/rejected": -322.0, + "loss": 0.7055419921875, + "memory(GiB)": 32.02, + "nll_loss": 0.96875, + "rewards/accuracies": 1.0, + 
"rewards/chosen": 12.75, + "rewards/margins": 9.625, + "rewards/rejected": 3.125, + "step": 35, + "train_speed(iter/s)": 0.346622 + }, + { + "epoch": 0.8, + "grad_norm": 13.025539300060226, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": 0.08740234375, + "logits/rejected": 0.71484375, + "logps/chosen": -225.0, + "logps/rejected": -322.0, + "loss": 0.6363037109375, + "memory(GiB)": 32.02, + "nll_loss": 0.56640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.0625, + "rewards/margins": 7.53125, + "rewards/rejected": 3.546875, + "step": 40, + "train_speed(iter/s)": 0.345794 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -0.8046875, + "eval_logits/rejected": 0.9375, + "eval_logps/chosen": -7.1875, + "eval_logps/rejected": -212.0, + "eval_loss": 0.5283203125, + "eval_nll_loss": 0.3125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 7.3125, + "eval_rewards/rejected": 2.796875, + "eval_runtime": 1.2905, + "eval_samples_per_second": 3.1, + "eval_steps_per_second": 0.775, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.49724108967226127, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": 2.890625, + "logits/rejected": 0.002349853515625, + "logps/chosen": -426.0, + "logps/rejected": -430.0, + "loss": 0.5437744140625, + "memory(GiB)": 32.02, + "nll_loss": 0.470703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.0625, + "rewards/margins": 10.6875, + "rewards/rejected": 3.390625, + "step": 45, + "train_speed(iter/s)": 0.342553 + }, + { + "epoch": 1.0, + "grad_norm": 1.4155859018634052, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": 0.2451171875, + "logits/rejected": 1.75, + "logps/chosen": -804.0, + "logps/rejected": -1480.0, + "loss": 0.57803955078125, + "memory(GiB)": 32.02, + "nll_loss": 0.6953125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.25, + "rewards/margins": 9.125, + "rewards/rejected": 7.125, + "step": 50, + "train_speed(iter/s)": 0.346029 + }, + { + 
"epoch": 1.1, + "grad_norm": 0.5694275670231188, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": 1.8359375, + "logits/rejected": 0.703125, + "logps/chosen": -386.0, + "logps/rejected": -612.0, + "loss": 0.6138671875, + "memory(GiB)": 43.66, + "nll_loss": 0.546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.75, + "rewards/margins": 12.25, + "rewards/rejected": 3.53125, + "step": 55, + "train_speed(iter/s)": 0.347089 + }, + { + "epoch": 1.2, + "grad_norm": 0.3472486708219598, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": 1.65625, + "logits/rejected": 2.15625, + "logps/chosen": -233.0, + "logps/rejected": -520.0, + "loss": 0.5795654296875, + "memory(GiB)": 43.66, + "nll_loss": 0.396484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.375, + "rewards/margins": 12.1875, + "rewards/rejected": 3.140625, + "step": 60, + "train_speed(iter/s)": 0.349928 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -0.23828125, + "eval_logits/rejected": 1.703125, + "eval_logps/chosen": -7.3125, + "eval_logps/rejected": -223.0, + "eval_loss": 0.51806640625, + "eval_nll_loss": 0.31640625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 8.4375, + "eval_rewards/rejected": 1.703125, + "eval_runtime": 1.2918, + "eval_samples_per_second": 3.096, + "eval_steps_per_second": 0.774, + "step": 60 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 52404171702272.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/training_args.bin 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..475462e40417630252a03e5a9352b5e11f000ed1 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:647dcffabf1a4c913f292b185978f2f179c4819a3df809536d284c803d9d97fe +size 8888 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/zero_to_fp32.py b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-60/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/README.md b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/README.md new file mode 100644 index 0000000000000000000000000000000000000000..525e0a2a5243ae77491e227cbf74fa8ff0a43579 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/deepseek-r1-7b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct 
and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/adapter_config.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..454fa4ee7fc180fe11671ee7f6c90d0ba44328a6 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + 
"rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "gate_proj", + "up_proj", + "o_proj", + "v_proj", + "down_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/adapter_model.safetensors b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1b4b1e692f81ebc0d5ec2acfe30bf33d7a878139 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0ad6efcebad1640961be47c5afae989a3b11f67f366bd9cdebc1f6bff4dbbb7 +size 40422208 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/additional_config.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/args.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/args.json new file mode 100644 index 0000000000000000000000000000000000000000..4b016b97ef2d4589dae9d4623f25925cb829d241 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/args.json @@ -0,0 +1,371 @@ +{ + "model": 
"/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_dpo_0.5_random20.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + 
"do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 5.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + 
"fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + 
"push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + 
"galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-7b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/home/wangruotong/LLM_test/Models/deepseek-r1-7b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, 
task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/deepseek-r1-7b", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, 
per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, 
use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, 
neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=4, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..72435165703b254725d70c5df89cc51f5ceeea10 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ddddc2c9577cb2021145b573841041e42e4219281de7bd842f3c95c3b1436a8 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..72e0209f73e952c513c4ae01a04f37f209eedbcc --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26937993564b9023e25232b97668368dc3e0ecd594acc053711d28ff6226ea5f +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5eec498e48867fba368e55563c701b4beab67f67 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67267b8e4e8fb27d0185270f6f697ded5f5906eeebec0d3ecdad44a4abba90ef +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e4b55eb702579d4bb19cacecae86da1c59a51a75 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:55021a2f4b1ad21b7782bafbbea05b27d6ed46e5a03ba69bcd875297eb1d968a +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0a10b494986176ae5edef7bb2fffc25466f306ca --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbf5650a98ea928ed6aa01282c21e1a8a58db2f74c57b2ba471e24d3baf39841 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1dcfe4bee5d54e4050a91a9a0290247c6bd1f374 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9aaad9da4e9fb88b7a841b944d5c2006a13f59a07e4fbcf76f36f9ed720b460 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..98d53ab759a6791cd4cf4ced29c664b04a5ca197 --- 
/dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ed76b60e3eba8e694c32a6c631ffbac5b010f3a088e1b4a6a88aefa7af76745 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..06a04ef01bbe5ba070aed6465072632fd589f9ac --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d53b8fbc2ca2dfe6d00ea9ede23d390bc5b3ce0cc53f09b8284947db5e9e950b +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/zero_pp_rank_0_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c266fcd4638fe47bf0dd5ca695bcef0b4784c49 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:baa449a8e6e8f52b9350a94af00a8858afe117addfafd9094d63074ace05d65b +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/zero_pp_rank_1_mp_rank_00_model_states.pt 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c4ca0b0f5edc794cfb4263d46aff6f1cfdf7b53b --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0526594f550cc1735f9c3b6eecba6741e165152424e3dab837f326143ea81935 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/zero_pp_rank_2_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..282b7345adf65d441994acb25c7d19b657e5d0e1 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dad5db6a431ade078ddffea891a53ae039382937a99c01e52a4f4a1858188c72 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/zero_pp_rank_3_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..02de38e1ae701dd72230ed5df3c2fb9a99876a72 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:da428381cff0ca32fb09db2aa672d03b7b0a56a431a89063af0877f412ae148b +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/zero_pp_rank_4_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6977eef82cd75d61cd3cba303d465b9932bdd19a --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51c116720c931343a9dd2ca6124fa24ded47ef4691d2d372f46deed4e51dfdf4 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/zero_pp_rank_5_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4d03497e2e63e9bcfea373618da0c781cd4f96b1 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:423a914d9ef30986dfc89cc03b735e326ef36067d02d720b36868d2bc59a696c +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/zero_pp_rank_6_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e4ffc8a5e41cb1d6e7c1534e8fce1115e75d39c --- /dev/null +++ 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3c1e4581df297364ab3c137cb013aebb685d51325d3416980fae1c7da28d369 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/zero_pp_rank_7_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f7c15ddc48748027926183571acc4798b32bb08a --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/global_step80/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:241d91b45831d40ce0663d010e9f98cb3d8e17825a8b844bddb43238f9dcd875 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/latest b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/latest new file mode 100644 index 0000000000000000000000000000000000000000..75eab498d0366633484ab40334e4b8fb92b16dad --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/latest @@ -0,0 +1 @@ +global_step80 \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/rng_state_0.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..9dd2a62da4ca83b3b986d96dbf0eaeb82207ca93 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/rng_state_0.pth @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:a0628a9017696045a3a29e9eaffc71e9262d855716e773c0c3be760a1fe85bc8 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/rng_state_1.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..1ba5f3aba4388a582cd47f7f9e57cd5879b1cbd2 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df342004a4d8e3626bf2a9f689fde7c8bfd6d995e14931f5496eda1f456cb6f2 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/rng_state_2.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..27b0f7845c2b9530c3e6ed3ce232ff4e86b86122 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f02096eb4e8850b91490e80e4a042e2e60f71bd2abc6a269d62c271649cb77d2 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/rng_state_3.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..fcfb583fc43c6dd4395671708744cfd18c419970 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:326c778d3d0e7e3d5665fa0a9ecd92986609c430da08b41611d6c05dc19815a8 +size 15984 diff --git 
a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/rng_state_4.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..7a8c64b1f15ac655b2be2a42fe61cabe2a877704 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d978dcb0c34e022ee6750e9d86814b8c82e4965d7e07662f35f06eeac12938f3 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/rng_state_5.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..262e8187e6caeca12ef3b0aa923b12afd697e03d --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01e83399aed1d9d173c3e07b2efa8530c956b62b2b68394c2ed0d43bd8bba9d1 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/rng_state_6.pth b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..72f794e31f8d3e0c63972e5076e1ed90c52087ba --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:606ab3ca92e3d20c327c69fdcce7f7e39bec2f2c3538b036088b255f917e3ba4 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/rng_state_7.pth 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..244e7fdaa1cef2e82bd4e16afb10f32f68318bcc --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1276a987dd22c9093fec58921ba19f340a28f18bff635cc01324e09a3c37ac3a +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/scheduler.pt b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2e36a588df493151f57c8f73aa08129a3810c2c7 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bee30cdff92a069fa950619177f737b278c096bc7c83c0e5bdea15a673218022 +size 1064 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/trainer_state.json b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c8a49e683dfec2f8e944bd02d70682ff7d4e35fa --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/trainer_state.json @@ -0,0 +1,407 @@ +{ + "best_metric": 0.50097656, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80", + "epoch": 1.6, + "eval_steps": 20, + "global_step": 80, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 9.15866064284304, + 
"learning_rate": 7.692307692307694e-06, + "logits/chosen": 2.390625, + "logits/rejected": -2.546875, + "logps/chosen": -520.0, + "logps/rejected": -142.0, + "loss": 2.4697265625, + "memory(GiB)": 8.4, + "nll_loss": 0.51171875, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.126463 + }, + { + "epoch": 0.1, + "grad_norm": 26.585759281524314, + "learning_rate": 3.846153846153846e-05, + "logits/chosen": 0.1533203125, + "logits/rejected": -0.8671875, + "logps/chosen": -728.0, + "logps/rejected": -304.0, + "loss": 2.3878173828125, + "memory(GiB)": 12.04, + "nll_loss": 1.1484375, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.125, + "rewards/margins": -0.0250244140625, + "rewards/rejected": 0.150390625, + "step": 5, + "train_speed(iter/s)": 0.276117 + }, + { + "epoch": 0.2, + "grad_norm": 8.664467296577401, + "learning_rate": 7.692307692307693e-05, + "logits/chosen": -0.419921875, + "logits/rejected": 1.2890625, + "logps/chosen": -644.0, + "logps/rejected": -572.0, + "loss": 2.18505859375, + "memory(GiB)": 17.87, + "nll_loss": 1.4921875, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.1103515625, + "rewards/margins": 0.08984375, + "rewards/rejected": 0.0201416015625, + "step": 10, + "train_speed(iter/s)": 0.313547 + }, + { + "epoch": 0.3, + "grad_norm": 6.325430646213694, + "learning_rate": 9.998242976313776e-05, + "logits/chosen": -0.7578125, + "logits/rejected": 1.8203125, + "logps/chosen": -924.0, + "logps/rejected": -1552.0, + "loss": 1.76845703125, + "memory(GiB)": 32.02, + "nll_loss": 1.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.75, + "rewards/margins": 2.390625, + "rewards/rejected": 0.359375, + "step": 15, + "train_speed(iter/s)": 0.318522 + }, + { + "epoch": 0.4, + "grad_norm": 2.351010309283972, + "learning_rate": 9.97849063861667e-05, + "logits/chosen": 0.5859375, + "logits/rejected": -0.4375, + "logps/chosen": -432.0, + 
"logps/rejected": -324.0, + "loss": 1.172265625, + "memory(GiB)": 32.02, + "nll_loss": 0.6640625, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 4.65625, + "rewards/margins": 1.5546875, + "rewards/rejected": 3.09375, + "step": 20, + "train_speed(iter/s)": 0.331194 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.3125, + "eval_logits/rejected": 0.86328125, + "eval_logps/chosen": -14.3125, + "eval_logps/rejected": -182.0, + "eval_loss": 0.6728515625, + "eval_nll_loss": 0.62109375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.375, + "eval_rewards/margins": 3.5625, + "eval_rewards/rejected": 5.8125, + "eval_runtime": 1.2862, + "eval_samples_per_second": 3.11, + "eval_steps_per_second": 0.778, + "step": 20 + }, + { + "epoch": 0.5, + "grad_norm": 0.5984307422346672, + "learning_rate": 9.936876709681668e-05, + "logits/chosen": 0.255859375, + "logits/rejected": -0.0390625, + "logps/chosen": -326.0, + "logps/rejected": -420.0, + "loss": 0.8047607421875, + "memory(GiB)": 32.02, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.125, + "rewards/margins": 5.40625, + "rewards/rejected": 3.6875, + "step": 25, + "train_speed(iter/s)": 0.330795 + }, + { + "epoch": 0.6, + "grad_norm": 0.5223734060204478, + "learning_rate": 9.873583924954152e-05, + "logits/chosen": -0.21875, + "logits/rejected": 1.546875, + "logps/chosen": -400.0, + "logps/rejected": -656.0, + "loss": 0.6901611328125, + "memory(GiB)": 32.02, + "nll_loss": 0.78125, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.5625, + "rewards/margins": 6.78125, + "rewards/rejected": 3.765625, + "step": 30, + "train_speed(iter/s)": 0.339407 + }, + { + "epoch": 0.7, + "grad_norm": 1.0839186250481967, + "learning_rate": 9.788890216258939e-05, + "logits/chosen": 1.5625, + "logits/rejected": -1.578125, + "logps/chosen": -592.0, + "logps/rejected": -322.0, + "loss": 0.7055419921875, + "memory(GiB)": 32.02, + "nll_loss": 0.96875, + "rewards/accuracies": 1.0, + 
"rewards/chosen": 12.75, + "rewards/margins": 9.625, + "rewards/rejected": 3.125, + "step": 35, + "train_speed(iter/s)": 0.346622 + }, + { + "epoch": 0.8, + "grad_norm": 13.025539300060226, + "learning_rate": 9.68316749134364e-05, + "logits/chosen": 0.08740234375, + "logits/rejected": 0.71484375, + "logps/chosen": -225.0, + "logps/rejected": -322.0, + "loss": 0.6363037109375, + "memory(GiB)": 32.02, + "nll_loss": 0.56640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.0625, + "rewards/margins": 7.53125, + "rewards/rejected": 3.546875, + "step": 40, + "train_speed(iter/s)": 0.345794 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -0.8046875, + "eval_logits/rejected": 0.9375, + "eval_logps/chosen": -7.1875, + "eval_logps/rejected": -212.0, + "eval_loss": 0.5283203125, + "eval_nll_loss": 0.3125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 7.3125, + "eval_rewards/rejected": 2.796875, + "eval_runtime": 1.2905, + "eval_samples_per_second": 3.1, + "eval_steps_per_second": 0.775, + "step": 40 + }, + { + "epoch": 0.9, + "grad_norm": 0.49724108967226127, + "learning_rate": 9.55688000075414e-05, + "logits/chosen": 2.890625, + "logits/rejected": 0.002349853515625, + "logps/chosen": -426.0, + "logps/rejected": -430.0, + "loss": 0.5437744140625, + "memory(GiB)": 32.02, + "nll_loss": 0.470703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.0625, + "rewards/margins": 10.6875, + "rewards/rejected": 3.390625, + "step": 45, + "train_speed(iter/s)": 0.342553 + }, + { + "epoch": 1.0, + "grad_norm": 1.4155859018634052, + "learning_rate": 9.410582299213573e-05, + "logits/chosen": 0.2451171875, + "logits/rejected": 1.75, + "logps/chosen": -804.0, + "logps/rejected": -1480.0, + "loss": 0.57803955078125, + "memory(GiB)": 32.02, + "nll_loss": 0.6953125, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.25, + "rewards/margins": 9.125, + "rewards/rejected": 7.125, + "step": 50, + "train_speed(iter/s)": 0.346029 + }, + { + 
"epoch": 1.1, + "grad_norm": 0.5694275670231188, + "learning_rate": 9.244916810456821e-05, + "logits/chosen": 1.8359375, + "logits/rejected": 0.703125, + "logps/chosen": -386.0, + "logps/rejected": -612.0, + "loss": 0.6138671875, + "memory(GiB)": 43.66, + "nll_loss": 0.546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.75, + "rewards/margins": 12.25, + "rewards/rejected": 3.53125, + "step": 55, + "train_speed(iter/s)": 0.347089 + }, + { + "epoch": 1.2, + "grad_norm": 0.3472486708219598, + "learning_rate": 9.060611006213832e-05, + "logits/chosen": 1.65625, + "logits/rejected": 2.15625, + "logps/chosen": -233.0, + "logps/rejected": -520.0, + "loss": 0.5795654296875, + "memory(GiB)": 43.66, + "nll_loss": 0.396484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.375, + "rewards/margins": 12.1875, + "rewards/rejected": 3.140625, + "step": 60, + "train_speed(iter/s)": 0.349928 + }, + { + "epoch": 1.2, + "eval_logits/chosen": -0.23828125, + "eval_logits/rejected": 1.703125, + "eval_logps/chosen": -7.3125, + "eval_logps/rejected": -223.0, + "eval_loss": 0.51806640625, + "eval_nll_loss": 0.31640625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.125, + "eval_rewards/margins": 8.4375, + "eval_rewards/rejected": 1.703125, + "eval_runtime": 1.2918, + "eval_samples_per_second": 3.096, + "eval_steps_per_second": 0.774, + "step": 60 + }, + { + "epoch": 1.3, + "grad_norm": 0.48830819519223906, + "learning_rate": 8.858474211729469e-05, + "logits/chosen": 0.97265625, + "logits/rejected": 1.9921875, + "logps/chosen": -346.0, + "logps/rejected": -844.0, + "loss": 0.6193359375, + "memory(GiB)": 43.66, + "nll_loss": 0.640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.5625, + "rewards/margins": 11.875, + "rewards/rejected": 3.703125, + "step": 65, + "train_speed(iter/s)": 0.34619 + }, + { + "epoch": 1.4, + "grad_norm": 0.3461001279478362, + "learning_rate": 8.639394051847472e-05, + "logits/chosen": -0.6875, + "logits/rejected": 2.28125, + 
"logps/chosen": -88.0, + "logps/rejected": -1120.0, + "loss": 0.5681396484375, + "memory(GiB)": 43.66, + "nll_loss": 0.84375, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 11.5625, + "rewards/margins": 9.125, + "rewards/rejected": 2.484375, + "step": 70, + "train_speed(iter/s)": 0.349389 + }, + { + "epoch": 1.5, + "grad_norm": 0.49738864814975586, + "learning_rate": 8.404332553264547e-05, + "logits/chosen": 2.515625, + "logits/rejected": 0.263671875, + "logps/chosen": -462.0, + "logps/rejected": -434.0, + "loss": 0.517205810546875, + "memory(GiB)": 43.66, + "nll_loss": 0.46875, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.625, + "rewards/margins": 16.375, + "rewards/rejected": 2.28125, + "step": 75, + "train_speed(iter/s)": 0.351345 + }, + { + "epoch": 1.6, + "grad_norm": 0.3498527388541882, + "learning_rate": 8.154321920070414e-05, + "logits/chosen": 3.5625, + "logits/rejected": -1.296875, + "logps/chosen": -580.0, + "logps/rejected": -132.0, + "loss": 0.5629638671875, + "memory(GiB)": 43.66, + "nll_loss": 0.703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 21.5, + "rewards/margins": 15.6875, + "rewards/rejected": 5.84375, + "step": 80, + "train_speed(iter/s)": 0.353922 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -0.68359375, + "eval_logits/rejected": 0.3125, + "eval_logps/chosen": -6.3125, + "eval_logps/rejected": -306.0, + "eval_loss": 0.5009765625, + "eval_nll_loss": 0.2734375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.1875, + "eval_rewards/margins": 16.75, + "eval_rewards/rejected": -6.59375, + "eval_runtime": 1.3254, + "eval_samples_per_second": 3.018, + "eval_steps_per_second": 0.754, + "step": 80 + } + ], + "logging_steps": 5, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + 
"should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 69561268436992.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/training_args.bin b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..475462e40417630252a03e5a9352b5e11f000ed1 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:647dcffabf1a4c913f292b185978f2f179c4819a3df809536d284c803d9d97fe +size 8888 diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/zero_to_fp32.py b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-80/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/eval_logits_chosen.png b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/eval_logits_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..16a5a8111f02980cda09d73ce90648a18f0af5dc Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/eval_logits_chosen.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/eval_logits_rejected.png b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/eval_logits_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..d544912aebd20a1199bc202c2b2ac2e4c72bd2a5 Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/eval_logits_rejected.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/eval_logps_chosen.png b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/eval_logps_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..c23a7b76a7580432036fbeefa15c29564236e3e7 Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/eval_logps_chosen.png differ diff --git 
a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/eval_logps_rejected.png b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/eval_logps_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..538e231a5ac65753d996637dc484b2cb88f9ed6c Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/eval_logps_rejected.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/eval_loss.png b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/eval_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..b2644798ed52e18df30cb36fe40928ef34da0318 Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/eval_loss.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/eval_nll_loss.png b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/eval_nll_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..46bb8a9b3b66624cdb177e9bd7f33610cf5d8cd3 Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/eval_nll_loss.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/eval_rewards_accuracies.png b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/eval_rewards_accuracies.png new file mode 100644 index 0000000000000000000000000000000000000000..bcd3f332a0a9276590b2d2aad6821619831dd64b Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/eval_rewards_accuracies.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/eval_rewards_chosen.png 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/eval_rewards_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..f786250ae61a0d24fb2af4bbd7e014e3cabb2ae1 Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/eval_rewards_chosen.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/eval_rewards_margins.png b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/eval_rewards_margins.png new file mode 100644 index 0000000000000000000000000000000000000000..ba8aa00c80fcd4cbe6e7d8b7f2d9d5cd7a485577 Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/eval_rewards_margins.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/eval_rewards_rejected.png b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/eval_rewards_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..1bb48922b8756912ff7ccd58b24fc9ad877842c7 Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/eval_rewards_rejected.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/eval_runtime.png b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/eval_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..6c30e893e7284f08b5e944707aa0c34a6f108264 Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/eval_runtime.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/eval_samples_per_second.png b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/eval_samples_per_second.png new 
file mode 100644 index 0000000000000000000000000000000000000000..de91c50c960c27c611db25bb859cbc1e211ae509 Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/eval_samples_per_second.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/eval_steps_per_second.png b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/eval_steps_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..a0fd0cb59a6cd0c9d77ba59bd8b05bcdf5eb01b1 Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/eval_steps_per_second.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_epoch.png b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_epoch.png new file mode 100644 index 0000000000000000000000000000000000000000..97933611536b8acda691b8294659ee4282ded3e6 Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_epoch.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_grad_norm.png b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_grad_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..d4972cd6c2a21912e1da2ae9e26621d5e85e531e Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_grad_norm.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_learning_rate.png b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_learning_rate.png new file mode 100644 index 0000000000000000000000000000000000000000..f869fa37203441700940366c27c7d20493a62bae Binary files /dev/null and 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_learning_rate.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_logits_chosen.png b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_logits_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..29e0a29ea59cc0992192c60b16a9d5febd911164 Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_logits_chosen.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_logits_rejected.png b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_logits_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..1d17174641df190caa71b9cab7d72a405846c34d Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_logits_rejected.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_logps_chosen.png b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_logps_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..26ef372e3da19a25782eb71aa6d647ecbb934f6d Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_logps_chosen.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_logps_rejected.png b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_logps_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..ecc80bcc689d396394af5c309d3deb10fe116bef Binary files /dev/null and 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_logps_rejected.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_loss.png b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..67c857702d4ae6a87b67c5ebd54ba7573c64c409 Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_loss.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_memory(GiB).png b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_memory(GiB).png new file mode 100644 index 0000000000000000000000000000000000000000..25ee94b5c14dad9f5452c332f01362df3b1097fe Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_memory(GiB).png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_nll_loss.png b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_nll_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..8ccc96433f9086ab7ec41c493c304eadcea34d16 Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_nll_loss.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_rewards_accuracies.png b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_rewards_accuracies.png new file mode 100644 index 0000000000000000000000000000000000000000..bd3fae85bdcd57b33807eb6cc5562978de8d921e Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_rewards_accuracies.png differ diff --git 
a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_rewards_chosen.png b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_rewards_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..8d562f729f689601c8a948664b1217c3e374d631 Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_rewards_chosen.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_rewards_margins.png b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_rewards_margins.png new file mode 100644 index 0000000000000000000000000000000000000000..e29f205e63d6e019b9844cc3b927d0ae54f58d22 Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_rewards_margins.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_rewards_rejected.png b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_rewards_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..8c689ce16cc7815370add32ca11c48703e1187cc Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_rewards_rejected.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_total_flos.png b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_total_flos.png new file mode 100644 index 0000000000000000000000000000000000000000..fbd4dd667db93fbf560379087e453fda3826c6fe Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_total_flos.png differ diff --git 
a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_train_loss.png b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..a302ccaa5e9765e62e373669df85668040b9ba13 Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_train_loss.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_train_runtime.png b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_train_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..02fbfaa65bac59ae1c916b84a7efda3905c8658b Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_train_runtime.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_train_samples_per_second.png b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_train_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..216dfafb7c21f95e1a3aa5fc84500bf71a70737c Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_train_samples_per_second.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_train_speed(iter_s).png b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_train_speed(iter_s).png new file mode 100644 index 0000000000000000000000000000000000000000..8e72093817f92307d4c94af3d7d107abf471439b Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_train_speed(iter_s).png differ diff --git 
a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_train_steps_per_second.png b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_train_steps_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..584fccc9f6f0f03ce813ed52b29753b851b4a7b8 Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/images/train_train_steps_per_second.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/logging.jsonl b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/logging.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cef2397936c9c84f236b4de8e3331757acdade57 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/logging.jsonl @@ -0,0 +1,66 @@ +{"loss": 2.46972656, "grad_norm": 9.15866064, "learning_rate": 7.69e-06, "memory(GiB)": 8.4, "train_speed(iter/s)": 0.126463, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/rejected": -142.0, "logps/chosen": -520.0, "logits/rejected": -2.546875, "logits/chosen": 2.390625, "nll_loss": 0.51171875, "epoch": 0.02, "global_step/max_steps": "1/250", "percentage": "0.40%", "elapsed_time": "4s", "remaining_time": "20m 20s"} +{"loss": 2.38781738, "grad_norm": 26.58575928, "learning_rate": 3.846e-05, "memory(GiB)": 12.04, "train_speed(iter/s)": 0.276117, "rewards/chosen": 0.125, "rewards/rejected": 0.15039062, "rewards/accuracies": 0.25, "rewards/margins": -0.02502441, "logps/rejected": -304.0, "logps/chosen": -728.0, "logits/rejected": -0.8671875, "logits/chosen": 0.15332031, "nll_loss": 1.1484375, "epoch": 0.1, "global_step/max_steps": "5/250", "percentage": "2.00%", "elapsed_time": "15s", "remaining_time": "12m 20s"} +{"loss": 2.18505859, "grad_norm": 8.6644673, "learning_rate": 7.692e-05, "memory(GiB)": 17.87, 
"train_speed(iter/s)": 0.313547, "rewards/chosen": 0.11035156, "rewards/rejected": 0.0201416, "rewards/accuracies": 0.60000002, "rewards/margins": 0.08984375, "logps/rejected": -572.0, "logps/chosen": -644.0, "logits/rejected": 1.2890625, "logits/chosen": -0.41992188, "nll_loss": 1.4921875, "epoch": 0.2, "global_step/max_steps": "10/250", "percentage": "4.00%", "elapsed_time": "28s", "remaining_time": "11m 33s"} +{"loss": 1.76845703, "grad_norm": 6.32543065, "learning_rate": 9.998e-05, "memory(GiB)": 32.02, "train_speed(iter/s)": 0.318522, "rewards/chosen": 2.75, "rewards/rejected": 0.359375, "rewards/accuracies": 1.0, "rewards/margins": 2.390625, "logps/rejected": -1552.0, "logps/chosen": -924.0, "logits/rejected": 1.8203125, "logits/chosen": -0.7578125, "nll_loss": 1.46875, "epoch": 0.3, "global_step/max_steps": "15/250", "percentage": "6.00%", "elapsed_time": "44s", "remaining_time": "11m 30s"} +{"loss": 1.17226563, "grad_norm": 2.35101031, "learning_rate": 9.978e-05, "memory(GiB)": 32.02, "train_speed(iter/s)": 0.331194, "rewards/chosen": 4.65625, "rewards/rejected": 3.09375, "rewards/accuracies": 0.80000001, "rewards/margins": 1.5546875, "logps/rejected": -324.0, "logps/chosen": -432.0, "logits/rejected": -0.4375, "logits/chosen": 0.5859375, "nll_loss": 0.6640625, "epoch": 0.4, "global_step/max_steps": "20/250", "percentage": "8.00%", "elapsed_time": "57s", "remaining_time": "10m 59s"} +{"eval_loss": 0.67285156, "eval_runtime": 1.2862, "eval_samples_per_second": 3.11, "eval_steps_per_second": 0.778, "eval_rewards/chosen": 9.375, "eval_rewards/rejected": 5.8125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 3.5625, "eval_logps/rejected": -182.0, "eval_logps/chosen": -14.3125, "eval_logits/rejected": 0.86328125, "eval_logits/chosen": -1.3125, "eval_nll_loss": 0.62109375, "epoch": 0.4, "global_step/max_steps": "20/250", "percentage": "8.00%", "elapsed_time": "58s", "remaining_time": "11m 14s"} +{"loss": 0.80476074, "grad_norm": 0.59843074, 
"learning_rate": 9.937e-05, "memory(GiB)": 32.02, "train_speed(iter/s)": 0.330795, "rewards/chosen": 9.125, "rewards/rejected": 3.6875, "rewards/accuracies": 1.0, "rewards/margins": 5.40625, "logps/rejected": -420.0, "logps/chosen": -326.0, "logits/rejected": -0.0390625, "logits/chosen": 0.25585938, "nll_loss": 0.5078125, "epoch": 0.5, "global_step/max_steps": "25/250", "percentage": "10.00%", "elapsed_time": "1m 12s", "remaining_time": "10m 53s"} +{"loss": 0.69016113, "grad_norm": 0.52237341, "learning_rate": 9.874e-05, "memory(GiB)": 32.02, "train_speed(iter/s)": 0.339407, "rewards/chosen": 10.5625, "rewards/rejected": 3.765625, "rewards/accuracies": 1.0, "rewards/margins": 6.78125, "logps/rejected": -656.0, "logps/chosen": -400.0, "logits/rejected": 1.546875, "logits/chosen": -0.21875, "nll_loss": 0.78125, "epoch": 0.6, "global_step/max_steps": "30/250", "percentage": "12.00%", "elapsed_time": "1m 25s", "remaining_time": "10m 26s"} +{"loss": 0.70554199, "grad_norm": 1.08391863, "learning_rate": 9.789e-05, "memory(GiB)": 32.02, "train_speed(iter/s)": 0.346622, "rewards/chosen": 12.75, "rewards/rejected": 3.125, "rewards/accuracies": 1.0, "rewards/margins": 9.625, "logps/rejected": -322.0, "logps/chosen": -592.0, "logits/rejected": -1.578125, "logits/chosen": 1.5625, "nll_loss": 0.96875, "epoch": 0.7, "global_step/max_steps": "35/250", "percentage": "14.00%", "elapsed_time": "1m 37s", "remaining_time": "10m 1s"} +{"loss": 0.63630371, "grad_norm": 13.0255393, "learning_rate": 9.683e-05, "memory(GiB)": 32.02, "train_speed(iter/s)": 0.345794, "rewards/chosen": 11.0625, "rewards/rejected": 3.546875, "rewards/accuracies": 1.0, "rewards/margins": 7.53125, "logps/rejected": -322.0, "logps/chosen": -225.0, "logits/rejected": 0.71484375, "logits/chosen": 0.08740234, "nll_loss": 0.56640625, "epoch": 0.8, "global_step/max_steps": "40/250", "percentage": "16.00%", "elapsed_time": "1m 52s", "remaining_time": "9m 51s"} +{"eval_loss": 0.52832031, "eval_runtime": 1.2905, 
"eval_samples_per_second": 3.1, "eval_steps_per_second": 0.775, "eval_rewards/chosen": 10.125, "eval_rewards/rejected": 2.796875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 7.3125, "eval_logps/rejected": -212.0, "eval_logps/chosen": -7.1875, "eval_logits/rejected": 0.9375, "eval_logits/chosen": -0.8046875, "eval_nll_loss": 0.3125, "epoch": 0.8, "global_step/max_steps": "40/250", "percentage": "16.00%", "elapsed_time": "1m 53s", "remaining_time": "9m 58s"} +{"loss": 0.54377441, "grad_norm": 0.49724109, "learning_rate": 9.557e-05, "memory(GiB)": 32.02, "train_speed(iter/s)": 0.342553, "rewards/chosen": 14.0625, "rewards/rejected": 3.390625, "rewards/accuracies": 1.0, "rewards/margins": 10.6875, "logps/rejected": -430.0, "logps/chosen": -426.0, "logits/rejected": 0.00234985, "logits/chosen": 2.890625, "nll_loss": 0.47070312, "epoch": 0.9, "global_step/max_steps": "45/250", "percentage": "18.00%", "elapsed_time": "2m 8s", "remaining_time": "9m 44s"} +{"loss": 0.57803955, "grad_norm": 1.4155859, "learning_rate": 9.411e-05, "memory(GiB)": 32.02, "train_speed(iter/s)": 0.346029, "rewards/chosen": 16.25, "rewards/rejected": 7.125, "rewards/accuracies": 1.0, "rewards/margins": 9.125, "logps/rejected": -1480.0, "logps/chosen": -804.0, "logits/rejected": 1.75, "logits/chosen": 0.24511719, "nll_loss": 0.6953125, "epoch": 1.0, "global_step/max_steps": "50/250", "percentage": "20.00%", "elapsed_time": "2m 21s", "remaining_time": "9m 25s"} +{"loss": 0.61386719, "grad_norm": 0.56942757, "learning_rate": 9.245e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.347089, "rewards/chosen": 15.75, "rewards/rejected": 3.53125, "rewards/accuracies": 1.0, "rewards/margins": 12.25, "logps/rejected": -612.0, "logps/chosen": -386.0, "logits/rejected": 0.703125, "logits/chosen": 1.8359375, "nll_loss": 0.546875, "epoch": 1.1, "global_step/max_steps": "55/250", "percentage": "22.00%", "elapsed_time": "2m 35s", "remaining_time": "9m 11s"} +{"loss": 0.57956543, "grad_norm": 
0.34724867, "learning_rate": 9.061e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.349928, "rewards/chosen": 15.375, "rewards/rejected": 3.140625, "rewards/accuracies": 1.0, "rewards/margins": 12.1875, "logps/rejected": -520.0, "logps/chosen": -233.0, "logits/rejected": 2.15625, "logits/chosen": 1.65625, "nll_loss": 0.39648438, "epoch": 1.2, "global_step/max_steps": "60/250", "percentage": "24.00%", "elapsed_time": "2m 48s", "remaining_time": "8m 53s"} +{"eval_loss": 0.51806641, "eval_runtime": 1.2918, "eval_samples_per_second": 3.096, "eval_steps_per_second": 0.774, "eval_rewards/chosen": 10.125, "eval_rewards/rejected": 1.703125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 8.4375, "eval_logps/rejected": -223.0, "eval_logps/chosen": -7.3125, "eval_logits/rejected": 1.703125, "eval_logits/chosen": -0.23828125, "eval_nll_loss": 0.31640625, "epoch": 1.2, "global_step/max_steps": "60/250", "percentage": "24.00%", "elapsed_time": "2m 49s", "remaining_time": "8m 57s"} +{"loss": 0.61933594, "grad_norm": 0.4883082, "learning_rate": 8.858e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.34619, "rewards/chosen": 15.5625, "rewards/rejected": 3.703125, "rewards/accuracies": 1.0, "rewards/margins": 11.875, "logps/rejected": -844.0, "logps/chosen": -346.0, "logits/rejected": 1.9921875, "logits/chosen": 0.97265625, "nll_loss": 0.640625, "epoch": 1.3, "global_step/max_steps": "65/250", "percentage": "26.00%", "elapsed_time": "3m 4s", "remaining_time": "8m 45s"} +{"loss": 0.56813965, "grad_norm": 0.34610013, "learning_rate": 8.639e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.349389, "rewards/chosen": 11.5625, "rewards/rejected": 2.484375, "rewards/accuracies": 0.80000001, "rewards/margins": 9.125, "logps/rejected": -1120.0, "logps/chosen": -88.0, "logits/rejected": 2.28125, "logits/chosen": -0.6875, "nll_loss": 0.84375, "epoch": 1.4, "global_step/max_steps": "70/250", "percentage": "28.00%", "elapsed_time": "3m 17s", "remaining_time": "8m 27s"} +{"loss": 
0.51720581, "grad_norm": 0.49738865, "learning_rate": 8.404e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.351345, "rewards/chosen": 18.625, "rewards/rejected": 2.28125, "rewards/accuracies": 1.0, "rewards/margins": 16.375, "logps/rejected": -434.0, "logps/chosen": -462.0, "logits/rejected": 0.26367188, "logits/chosen": 2.515625, "nll_loss": 0.46875, "epoch": 1.5, "global_step/max_steps": "75/250", "percentage": "30.00%", "elapsed_time": "3m 30s", "remaining_time": "8m 11s"} +{"loss": 0.56296387, "grad_norm": 0.34985274, "learning_rate": 8.154e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.353922, "rewards/chosen": 21.5, "rewards/rejected": 5.84375, "rewards/accuracies": 1.0, "rewards/margins": 15.6875, "logps/rejected": -132.0, "logps/chosen": -580.0, "logits/rejected": -1.296875, "logits/chosen": 3.5625, "nll_loss": 0.703125, "epoch": 1.6, "global_step/max_steps": "80/250", "percentage": "32.00%", "elapsed_time": "3m 43s", "remaining_time": "7m 53s"} +{"eval_loss": 0.50097656, "eval_runtime": 1.3254, "eval_samples_per_second": 3.018, "eval_steps_per_second": 0.754, "eval_rewards/chosen": 10.1875, "eval_rewards/rejected": -6.59375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 16.75, "eval_logps/rejected": -306.0, "eval_logps/chosen": -6.3125, "eval_logits/rejected": 0.3125, "eval_logits/chosen": -0.68359375, "eval_nll_loss": 0.2734375, "epoch": 1.6, "global_step/max_steps": "80/250", "percentage": "32.00%", "elapsed_time": "3m 44s", "remaining_time": "7m 56s"} +{"loss": 0.70977783, "grad_norm": 0.66498772, "learning_rate": 7.89e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.351477, "rewards/chosen": 15.5, "rewards/rejected": -2.859375, "rewards/accuracies": 1.0, "rewards/margins": 18.375, "logps/rejected": -768.0, "logps/chosen": -326.0, "logits/rejected": 0.38867188, "logits/chosen": 0.72265625, "nll_loss": 0.45898438, "epoch": 1.7, "global_step/max_steps": "85/250", "percentage": "34.00%", "elapsed_time": "3m 58s", "remaining_time": "7m 
43s"} +{"loss": 0.54343262, "grad_norm": 0.57743911, "learning_rate": 7.614e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.351018, "rewards/chosen": 14.75, "rewards/rejected": 0.5390625, "rewards/accuracies": 1.0, "rewards/margins": 14.25, "logps/rejected": -520.0, "logps/chosen": -290.0, "logits/rejected": 1.515625, "logits/chosen": 2.15625, "nll_loss": 0.56640625, "epoch": 1.8, "global_step/max_steps": "90/250", "percentage": "36.00%", "elapsed_time": "4m 13s", "remaining_time": "7m 30s"} +{"loss": 0.60682373, "grad_norm": 0.46335079, "learning_rate": 7.326e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.351082, "rewards/chosen": 16.75, "rewards/rejected": 0.7890625, "rewards/accuracies": 1.0, "rewards/margins": 15.9375, "logps/rejected": -620.0, "logps/chosen": -366.0, "logits/rejected": 1.1640625, "logits/chosen": 1.65625, "nll_loss": 0.5, "epoch": 1.9, "global_step/max_steps": "95/250", "percentage": "38.00%", "elapsed_time": "4m 27s", "remaining_time": "7m 16s"} +{"loss": 0.51856689, "grad_norm": 0.44055184, "learning_rate": 7.028e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.350869, "rewards/chosen": 15.6875, "rewards/rejected": 1.65625, "rewards/accuracies": 1.0, "rewards/margins": 14.0, "logps/rejected": -360.0, "logps/chosen": -276.0, "logits/rejected": 0.10253906, "logits/chosen": 0.59765625, "nll_loss": 0.47070312, "epoch": 2.0, "global_step/max_steps": "100/250", "percentage": "40.00%", "elapsed_time": "4m 42s", "remaining_time": "7m 3s"} +{"eval_loss": 0.50390625, "eval_runtime": 1.3254, "eval_samples_per_second": 3.018, "eval_steps_per_second": 0.754, "eval_rewards/chosen": 10.125, "eval_rewards/rejected": -4.0, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 14.125, "eval_logps/rejected": -280.0, "eval_logps/chosen": -6.84375, "eval_logits/rejected": 0.66796875, "eval_logits/chosen": -0.63671875, "eval_nll_loss": 0.29882812, "epoch": 2.0, "global_step/max_steps": "100/250", "percentage": "40.00%", "elapsed_time": "4m 43s", 
"remaining_time": "7m 4s"} +{"loss": 0.54086914, "grad_norm": 0.4331983, "learning_rate": 6.72e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.34793, "rewards/chosen": 20.375, "rewards/rejected": 4.25, "rewards/accuracies": 1.0, "rewards/margins": 16.125, "logps/rejected": -580.0, "logps/chosen": -442.0, "logits/rejected": 0.18554688, "logits/chosen": 3.5, "nll_loss": 0.60546875, "epoch": 2.1, "global_step/max_steps": "105/250", "percentage": "42.00%", "elapsed_time": "4m 58s", "remaining_time": "6m 52s"} +{"loss": 0.56291504, "grad_norm": 0.50871862, "learning_rate": 6.406e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.34833, "rewards/chosen": 20.875, "rewards/rejected": 6.34375, "rewards/accuracies": 1.0, "rewards/margins": 14.5, "logps/rejected": -1280.0, "logps/chosen": -688.0, "logits/rejected": 1.9375, "logits/chosen": 0.41015625, "nll_loss": 0.69140625, "epoch": 2.2, "global_step/max_steps": "110/250", "percentage": "44.00%", "elapsed_time": "5m 12s", "remaining_time": "6m 38s"} +{"loss": 0.59663086, "grad_norm": 0.30352973, "learning_rate": 6.085e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.349547, "rewards/chosen": 21.0, "rewards/rejected": 5.5, "rewards/accuracies": 1.0, "rewards/margins": 15.4375, "logps/rejected": -704.0, "logps/chosen": -568.0, "logits/rejected": 1.0625, "logits/chosen": 1.546875, "nll_loss": 0.5390625, "epoch": 2.3, "global_step/max_steps": "115/250", "percentage": "46.00%", "elapsed_time": "5m 25s", "remaining_time": "6m 22s"} +{"loss": 0.46305542, "grad_norm": 0.50996805, "learning_rate": 5.759e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.35126, "rewards/chosen": 16.625, "rewards/rejected": 3.484375, "rewards/accuracies": 1.0, "rewards/margins": 13.1875, "logps/rejected": -316.0, "logps/chosen": -368.0, "logits/rejected": 0.00234985, "logits/chosen": 2.40625, "nll_loss": 0.37109375, "epoch": 2.4, "global_step/max_steps": "120/250", "percentage": "48.00%", "elapsed_time": "5m 38s", "remaining_time": "6m 6s"} 
+{"eval_loss": 0.49755859, "eval_runtime": 1.3007, "eval_samples_per_second": 3.075, "eval_steps_per_second": 0.769, "eval_rewards/chosen": 10.1875, "eval_rewards/rejected": -4.0, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 14.1875, "eval_logps/rejected": -280.0, "eval_logps/chosen": -6.4375, "eval_logits/rejected": 0.41601562, "eval_logits/chosen": -0.96484375, "eval_nll_loss": 0.27929688, "epoch": 2.4, "global_step/max_steps": "120/250", "percentage": "48.00%", "elapsed_time": "5m 39s", "remaining_time": "6m 8s"} +{"loss": 0.53605042, "grad_norm": 0.67592123, "learning_rate": 5.43e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.350267, "rewards/chosen": 11.75, "rewards/rejected": -7.96875, "rewards/accuracies": 1.0, "rewards/margins": 19.75, "logps/rejected": -1288.0, "logps/chosen": -62.5, "logits/rejected": 0.82421875, "logits/chosen": -1.4140625, "nll_loss": 0.7265625, "epoch": 2.5, "global_step/max_steps": "125/250", "percentage": "50.00%", "elapsed_time": "5m 53s", "remaining_time": "5m 53s"} +{"loss": 0.46550293, "grad_norm": 0.36351185, "learning_rate": 5.099e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.35164, "rewards/chosen": 18.875, "rewards/rejected": 3.515625, "rewards/accuracies": 1.0, "rewards/margins": 15.3125, "logps/rejected": -314.0, "logps/chosen": -348.0, "logits/rejected": 0.546875, "logits/chosen": 2.3125, "nll_loss": 0.484375, "epoch": 2.6, "global_step/max_steps": "130/250", "percentage": "52.00%", "elapsed_time": "6m 6s", "remaining_time": "5m 38s"} +{"loss": 0.44596214, "grad_norm": 0.45464234, "learning_rate": 4.768e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.352504, "rewards/chosen": 13.375, "rewards/rejected": -1.5625, "rewards/accuracies": 1.0, "rewards/margins": 14.9375, "logps/rejected": -624.0, "logps/chosen": -157.0, "logits/rejected": 0.88671875, "logits/chosen": -0.375, "nll_loss": 0.24609375, "epoch": 2.7, "global_step/max_steps": "135/250", "percentage": "54.00%", "elapsed_time": "6m 19s", 
"remaining_time": "5m 23s"} +{"loss": 0.49989929, "grad_norm": 0.37665269, "learning_rate": 4.438e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.353731, "rewards/chosen": 18.625, "rewards/rejected": 1.5546875, "rewards/accuracies": 1.0, "rewards/margins": 17.0, "logps/rejected": -708.0, "logps/chosen": -392.0, "logits/rejected": 1.1484375, "logits/chosen": 1.484375, "nll_loss": 0.734375, "epoch": 2.8, "global_step/max_steps": "140/250", "percentage": "56.00%", "elapsed_time": "6m 32s", "remaining_time": "5m 8s"} +{"eval_loss": 0.49853516, "eval_runtime": 1.2976, "eval_samples_per_second": 3.083, "eval_steps_per_second": 0.771, "eval_rewards/chosen": 10.1875, "eval_rewards/rejected": -4.1875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 14.375, "eval_logps/rejected": -282.0, "eval_logps/chosen": -6.65625, "eval_logits/rejected": 0.28515625, "eval_logits/chosen": -1.0546875, "eval_nll_loss": 0.2890625, "epoch": 2.8, "global_step/max_steps": "140/250", "percentage": "56.00%", "elapsed_time": "6m 34s", "remaining_time": "5m 9s"} +{"loss": 0.55924377, "grad_norm": 0.73684756, "learning_rate": 4.11e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.350922, "rewards/chosen": 21.75, "rewards/rejected": 4.90625, "rewards/accuracies": 1.0, "rewards/margins": 16.875, "logps/rejected": -664.0, "logps/chosen": -624.0, "logits/rejected": 0.7578125, "logits/chosen": 1.484375, "nll_loss": 0.7421875, "epoch": 2.9, "global_step/max_steps": "145/250", "percentage": "58.00%", "elapsed_time": "6m 50s", "remaining_time": "4m 57s"} +{"loss": 0.4158844, "grad_norm": 0.52643837, "learning_rate": 3.786e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.350374, "rewards/chosen": 22.5, "rewards/rejected": 3.5, "rewards/accuracies": 1.0, "rewards/margins": 19.0, "logps/rejected": -310.0, "logps/chosen": -406.0, "logits/rejected": -0.11962891, "logits/chosen": 2.53125, "nll_loss": 0.39453125, "epoch": 3.0, "global_step/max_steps": "150/250", "percentage": "60.00%", 
"elapsed_time": "7m 5s", "remaining_time": "4m 43s"} +{"loss": 0.53366508, "grad_norm": 0.3832315, "learning_rate": 3.468e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.350973, "rewards/chosen": 19.125, "rewards/rejected": 1.7265625, "rewards/accuracies": 1.0, "rewards/margins": 17.375, "logps/rejected": -588.0, "logps/chosen": -600.0, "logits/rejected": 1.328125, "logits/chosen": 1.1484375, "nll_loss": 0.515625, "epoch": 3.1, "global_step/max_steps": "155/250", "percentage": "62.00%", "elapsed_time": "7m 18s", "remaining_time": "4m 28s"} +{"loss": 0.45610352, "grad_norm": 0.41804293, "learning_rate": 3.156e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.350912, "rewards/chosen": 21.5, "rewards/rejected": 4.71875, "rewards/accuracies": 1.0, "rewards/margins": 16.75, "logps/rejected": -360.0, "logps/chosen": -416.0, "logits/rejected": 0.10693359, "logits/chosen": 2.84375, "nll_loss": 0.38671875, "epoch": 3.2, "global_step/max_steps": "160/250", "percentage": "64.00%", "elapsed_time": "7m 32s", "remaining_time": "4m 14s"} +{"eval_loss": 0.49707031, "eval_runtime": 1.2858, "eval_samples_per_second": 3.111, "eval_steps_per_second": 0.778, "eval_rewards/chosen": 10.1875, "eval_rewards/rejected": -4.59375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 14.75, "eval_logps/rejected": -286.0, "eval_logps/chosen": -6.46875, "eval_logits/rejected": 0.16699219, "eval_logits/chosen": -1.046875, "eval_nll_loss": 0.28125, "epoch": 3.2, "global_step/max_steps": "160/250", "percentage": "64.00%", "elapsed_time": "7m 34s", "remaining_time": "4m 15s"} +{"loss": 0.42852402, "grad_norm": 0.99779868, "learning_rate": 2.852e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.350263, "rewards/chosen": 16.625, "rewards/rejected": -1.7890625, "rewards/accuracies": 1.0, "rewards/margins": 18.5, "logps/rejected": -498.0, "logps/chosen": -145.0, "logits/rejected": 0.234375, "logits/chosen": -0.17382812, "nll_loss": 0.16796875, "epoch": 3.3, "global_step/max_steps": "165/250", 
"percentage": "66.00%", "elapsed_time": "7m 48s", "remaining_time": "4m 1s"} +{"loss": 0.45649567, "grad_norm": 0.50734806, "learning_rate": 2.558e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.35017, "rewards/chosen": 25.0, "rewards/rejected": 5.875, "rewards/accuracies": 1.0, "rewards/margins": 19.125, "logps/rejected": -207.0, "logps/chosen": -640.0, "logits/rejected": -0.92578125, "logits/chosen": 3.234375, "nll_loss": 0.453125, "epoch": 3.4, "global_step/max_steps": "170/250", "percentage": "68.00%", "elapsed_time": "8m 2s", "remaining_time": "3m 47s"} +{"loss": 0.39521122, "grad_norm": 0.5295453, "learning_rate": 2.274e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.350866, "rewards/chosen": 21.75, "rewards/rejected": 7.03125, "rewards/accuracies": 1.0, "rewards/margins": 14.75, "logps/rejected": -592.0, "logps/chosen": -466.0, "logits/rejected": 0.39257812, "logits/chosen": 0.49804688, "nll_loss": 0.65625, "epoch": 3.5, "global_step/max_steps": "175/250", "percentage": "70.00%", "elapsed_time": "8m 15s", "remaining_time": "3m 32s"} +{"loss": 0.50094604, "grad_norm": 0.30567037, "learning_rate": 2.002e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.351966, "rewards/chosen": 21.25, "rewards/rejected": 4.125, "rewards/accuracies": 1.0, "rewards/margins": 17.125, "logps/rejected": -262.0, "logps/chosen": -482.0, "logits/rejected": -0.09326172, "logits/chosen": 3.09375, "nll_loss": 0.546875, "epoch": 3.6, "global_step/max_steps": "180/250", "percentage": "72.00%", "elapsed_time": "8m 28s", "remaining_time": "3m 17s"} +{"eval_loss": 0.50195312, "eval_runtime": 1.2632, "eval_samples_per_second": 3.167, "eval_steps_per_second": 0.792, "eval_rewards/chosen": 10.125, "eval_rewards/rejected": -4.40625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 14.5, "eval_logps/rejected": -284.0, "eval_logps/chosen": -7.0, "eval_logits/rejected": 0.06054688, "eval_logits/chosen": -1.1640625, "eval_nll_loss": 0.30273438, "epoch": 3.6, "global_step/max_steps": 
"180/250", "percentage": "72.00%", "elapsed_time": "8m 29s", "remaining_time": "3m 18s"} +{"loss": 0.47723646, "grad_norm": 0.9188395, "learning_rate": 1.744e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.35126, "rewards/chosen": 21.25, "rewards/rejected": 4.40625, "rewards/accuracies": 1.0, "rewards/margins": 16.875, "logps/rejected": -229.0, "logps/chosen": -536.0, "logits/rejected": -0.10009766, "logits/chosen": 3.0, "nll_loss": 0.609375, "epoch": 3.7, "global_step/max_steps": "185/250", "percentage": "74.00%", "elapsed_time": "8m 43s", "remaining_time": "3m 3s"} +{"loss": 0.4629364, "grad_norm": 0.52888017, "learning_rate": 1.5e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.352271, "rewards/chosen": 19.75, "rewards/rejected": 1.34375, "rewards/accuracies": 1.0, "rewards/margins": 18.5, "logps/rejected": -708.0, "logps/chosen": -330.0, "logits/rejected": 1.4140625, "logits/chosen": 0.9765625, "nll_loss": 0.6875, "epoch": 3.8, "global_step/max_steps": "190/250", "percentage": "76.00%", "elapsed_time": "8m 56s", "remaining_time": "2m 49s"} +{"loss": 0.38446922, "grad_norm": 0.59169349, "learning_rate": 1.271e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.352064, "rewards/chosen": 22.75, "rewards/rejected": 2.765625, "rewards/accuracies": 1.0, "rewards/margins": 20.0, "logps/rejected": -680.0, "logps/chosen": -414.0, "logits/rejected": 0.78125, "logits/chosen": -0.31640625, "nll_loss": 0.43554688, "epoch": 3.9, "global_step/max_steps": "195/250", "percentage": "78.00%", "elapsed_time": "9m 10s", "remaining_time": "2m 35s"} +{"loss": 0.43031769, "grad_norm": 0.65134231, "learning_rate": 1.059e-05, "memory(GiB)": 56.51, "train_speed(iter/s)": 0.351751, "rewards/chosen": 21.875, "rewards/rejected": 3.78125, "rewards/accuracies": 1.0, "rewards/margins": 18.125, "logps/rejected": -728.0, "logps/chosen": -414.0, "logits/rejected": 0.53125, "logits/chosen": 1.2578125, "nll_loss": 0.48046875, "epoch": 4.0, "global_step/max_steps": "200/250", "percentage": 
"80.00%", "elapsed_time": "9m 25s", "remaining_time": "2m 21s"} +{"eval_loss": 0.50976562, "eval_runtime": 1.2468, "eval_samples_per_second": 3.208, "eval_steps_per_second": 0.802, "eval_rewards/chosen": 10.125, "eval_rewards/rejected": -4.59375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 14.75, "eval_logps/rejected": -286.0, "eval_logps/chosen": -7.5625, "eval_logits/rejected": 0.06347656, "eval_logits/chosen": -1.265625, "eval_nll_loss": 0.328125, "epoch": 4.0, "global_step/max_steps": "200/250", "percentage": "80.00%", "elapsed_time": "9m 26s", "remaining_time": "2m 21s"} +{"loss": 0.4748867, "grad_norm": 0.46757623, "learning_rate": 8.63e-06, "memory(GiB)": 56.51, "train_speed(iter/s)": 0.35013, "rewards/chosen": 21.5, "rewards/rejected": 5.40625, "rewards/accuracies": 1.0, "rewards/margins": 16.125, "logps/rejected": -704.0, "logps/chosen": -428.0, "logits/rejected": 1.0546875, "logits/chosen": 0.8984375, "nll_loss": 0.4375, "epoch": 4.1, "global_step/max_steps": "205/250", "percentage": "82.00%", "elapsed_time": "9m 42s", "remaining_time": "2m 7s"} +{"loss": 0.42456894, "grad_norm": 0.55219121, "learning_rate": 6.87e-06, "memory(GiB)": 56.51, "train_speed(iter/s)": 0.350785, "rewards/chosen": 20.125, "rewards/rejected": 3.71875, "rewards/accuracies": 1.0, "rewards/margins": 16.375, "logps/rejected": -230.0, "logps/chosen": -274.0, "logits/rejected": -0.33984375, "logits/chosen": 2.734375, "nll_loss": 0.34375, "epoch": 4.2, "global_step/max_steps": "210/250", "percentage": "84.00%", "elapsed_time": "9m 55s", "remaining_time": "1m 53s"} +{"loss": 0.42719383, "grad_norm": 0.61232576, "learning_rate": 5.29e-06, "memory(GiB)": 56.51, "train_speed(iter/s)": 0.35093, "rewards/chosen": 19.0, "rewards/rejected": 0.8046875, "rewards/accuracies": 1.0, "rewards/margins": 18.25, "logps/rejected": -1216.0, "logps/chosen": -211.0, "logits/rejected": 1.8359375, "logits/chosen": -1.28125, "nll_loss": 0.32617188, "epoch": 4.3, "global_step/max_steps": "215/250", 
"percentage": "86.00%", "elapsed_time": "10m 9s", "remaining_time": "1m 39s"} +{"loss": 0.3799614, "grad_norm": 0.35292376, "learning_rate": 3.9e-06, "memory(GiB)": 56.51, "train_speed(iter/s)": 0.350899, "rewards/chosen": 16.25, "rewards/rejected": -2.078125, "rewards/accuracies": 1.0, "rewards/margins": 18.375, "logps/rejected": -984.0, "logps/chosen": -135.0, "logits/rejected": 1.640625, "logits/chosen": -1.171875, "nll_loss": 0.23632812, "epoch": 4.4, "global_step/max_steps": "220/250", "percentage": "88.00%", "elapsed_time": "10m 23s", "remaining_time": "1m 25s"} +{"eval_loss": 0.51318359, "eval_runtime": 1.3289, "eval_samples_per_second": 3.01, "eval_steps_per_second": 0.753, "eval_rewards/chosen": 10.0625, "eval_rewards/rejected": -5.0, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 15.0625, "eval_logps/rejected": -290.0, "eval_logps/chosen": -7.9375, "eval_logits/rejected": 0.03662109, "eval_logits/chosen": -1.3125, "eval_nll_loss": 0.34570312, "epoch": 4.4, "global_step/max_steps": "220/250", "percentage": "88.00%", "elapsed_time": "10m 25s", "remaining_time": "1m 25s"} +{"loss": 0.42312088, "grad_norm": 0.41222962, "learning_rate": 2.72e-06, "memory(GiB)": 56.51, "train_speed(iter/s)": 0.350059, "rewards/chosen": 25.875, "rewards/rejected": 6.09375, "rewards/accuracies": 1.0, "rewards/margins": 19.75, "logps/rejected": -280.0, "logps/chosen": -696.0, "logits/rejected": -0.31835938, "logits/chosen": 4.0, "nll_loss": 0.70703125, "epoch": 4.5, "global_step/max_steps": "225/250", "percentage": "90.00%", "elapsed_time": "10m 39s", "remaining_time": "1m 11s"} +{"loss": 0.38463011, "grad_norm": 0.44695222, "learning_rate": 1.75e-06, "memory(GiB)": 56.51, "train_speed(iter/s)": 0.35023, "rewards/chosen": 20.5, "rewards/rejected": 0.07177734, "rewards/accuracies": 1.0, "rewards/margins": 20.375, "logps/rejected": -716.0, "logps/chosen": -320.0, "logits/rejected": 0.1640625, "logits/chosen": 0.98828125, "nll_loss": 0.41992188, "epoch": 4.6, 
"global_step/max_steps": "230/250", "percentage": "92.00%", "elapsed_time": "10m 53s", "remaining_time": "56s"} +{"loss": 0.37644997, "grad_norm": 0.71520247, "learning_rate": 9.9e-07, "memory(GiB)": 56.51, "train_speed(iter/s)": 0.350576, "rewards/chosen": 17.25, "rewards/rejected": -1.71875, "rewards/accuracies": 1.0, "rewards/margins": 18.875, "logps/rejected": -712.0, "logps/chosen": -221.0, "logits/rejected": 1.296875, "logits/chosen": 0.10253906, "nll_loss": 0.3046875, "epoch": 4.7, "global_step/max_steps": "235/250", "percentage": "94.00%", "elapsed_time": "11m 7s", "remaining_time": "42s"} +{"loss": 0.42983589, "grad_norm": 0.32906939, "learning_rate": 4.4e-07, "memory(GiB)": 56.51, "train_speed(iter/s)": 0.351336, "rewards/chosen": 27.0, "rewards/rejected": 6.5, "rewards/accuracies": 1.0, "rewards/margins": 20.5, "logps/rejected": -828.0, "logps/chosen": -556.0, "logits/rejected": -0.41601562, "logits/chosen": 0.9375, "nll_loss": 0.8515625, "epoch": 4.8, "global_step/max_steps": "240/250", "percentage": "96.00%", "elapsed_time": "11m 20s", "remaining_time": "28s"} +{"eval_loss": 0.51806641, "eval_runtime": 1.2957, "eval_samples_per_second": 3.087, "eval_steps_per_second": 0.772, "eval_rewards/chosen": 10.0, "eval_rewards/rejected": -5.1875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 15.1875, "eval_logps/rejected": -292.0, "eval_logps/chosen": -8.3125, "eval_logits/rejected": 0.03710938, "eval_logits/chosen": -1.34375, "eval_nll_loss": 0.36132812, "epoch": 4.8, "global_step/max_steps": "240/250", "percentage": "96.00%", "elapsed_time": "11m 21s", "remaining_time": "28s"} +{"loss": 0.44684172, "grad_norm": 0.46634398, "learning_rate": 1.1e-07, "memory(GiB)": 56.51, "train_speed(iter/s)": 0.350845, "rewards/chosen": 22.75, "rewards/rejected": 2.78125, "rewards/accuracies": 1.0, "rewards/margins": 20.0, "logps/rejected": -624.0, "logps/chosen": -548.0, "logits/rejected": 0.16113281, "logits/chosen": 1.9296875, "nll_loss": 0.57421875, "epoch": 4.9, 
"global_step/max_steps": "245/250", "percentage": "98.00%", "elapsed_time": "11m 35s", "remaining_time": "14s"} +{"loss": 0.47538567, "grad_norm": 0.32118489, "learning_rate": 0.0, "memory(GiB)": 56.51, "train_speed(iter/s)": 0.351443, "rewards/chosen": 21.875, "rewards/rejected": 1.78125, "rewards/accuracies": 1.0, "rewards/margins": 20.125, "logps/rejected": -464.0, "logps/chosen": -398.0, "logits/rejected": 0.6953125, "logits/chosen": 1.609375, "nll_loss": 0.46679688, "epoch": 5.0, "global_step/max_steps": "250/250", "percentage": "100.00%", "elapsed_time": "11m 48s", "remaining_time": "0s"} +{"eval_loss": 0.51513672, "eval_runtime": 1.2604, "eval_samples_per_second": 3.174, "eval_steps_per_second": 0.793, "eval_rewards/chosen": 10.0625, "eval_rewards/rejected": -5.1875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 15.25, "eval_logps/rejected": -292.0, "eval_logps/chosen": -8.125, "eval_logits/rejected": 0.03417969, "eval_logits/chosen": -1.3515625, "eval_nll_loss": 0.35351562, "epoch": 5.0, "global_step/max_steps": "250/250", "percentage": "100.00%", "elapsed_time": "11m 49s", "remaining_time": "0s"} +{"train_runtime": 710.5808, "train_samples_per_second": 2.779, "train_steps_per_second": 0.352, "total_flos": 219416242290688.0, "train_loss": 0.6236607, "epoch": 5.0, "global_step/max_steps": "250/250", "percentage": "100.00%", "elapsed_time": "11m 50s", "remaining_time": "0s"} +{"train_dataset": "1172.215190±496.010190, min=300.000000, max=4173.000000, size=395", "val_dataset": "1183.750000±508.140421, min=717.000000, max=2024.000000, size=4", "model_parameter_info": "PeftModelForCausalLM: 7635.8016M Params (20.1851M Trainable [0.2643%]), 0.0001M Buffers.", "last_model_checkpoint": "/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-250", "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/checkpoint-160", "best_metric": 0.49707031, "global_step": 250, "log_history": [{"loss": 2.4697265625, "grad_norm": 9.15866064284304, "learning_rate": 7.692307692307694e-06, "memory(GiB)": 8.4, "train_speed(iter/s)": 0.126463, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/rejected": -142.0, "logps/chosen": -520.0, "logits/rejected": -2.546875, "logits/chosen": 2.390625, "nll_loss": 0.51171875, "epoch": 0.02, "step": 1}, {"loss": 2.3878173828125, "grad_norm": 26.585759281524314, "learning_rate": 3.846153846153846e-05, "memory(GiB)": 12.04, "train_speed(iter/s)": 0.276117, "rewards/chosen": 0.125, "rewards/rejected": 0.150390625, "rewards/accuracies": 0.25, "rewards/margins": -0.0250244140625, "logps/rejected": -304.0, "logps/chosen": -728.0, "logits/rejected": -0.8671875, "logits/chosen": 0.1533203125, "nll_loss": 1.1484375, "epoch": 0.1, "step": 5}, {"loss": 2.18505859375, "grad_norm": 8.664467296577401, "learning_rate": 7.692307692307693e-05, "memory(GiB)": 17.87, "train_speed(iter/s)": 0.313547, "rewards/chosen": 0.1103515625, "rewards/rejected": 0.0201416015625, "rewards/accuracies": 0.6000000238418579, "rewards/margins": 0.08984375, "logps/rejected": -572.0, "logps/chosen": -644.0, "logits/rejected": 1.2890625, "logits/chosen": -0.419921875, "nll_loss": 1.4921875, "epoch": 0.2, "step": 10}, {"loss": 1.76845703125, "grad_norm": 6.325430646213694, "learning_rate": 9.998242976313776e-05, "memory(GiB)": 32.02, "train_speed(iter/s)": 0.318522, "rewards/chosen": 2.75, "rewards/rejected": 0.359375, "rewards/accuracies": 1.0, "rewards/margins": 2.390625, "logps/rejected": -1552.0, "logps/chosen": -924.0, "logits/rejected": 1.8203125, "logits/chosen": -0.7578125, "nll_loss": 1.46875, "epoch": 0.3, "step": 15}, {"loss": 1.172265625, "grad_norm": 2.351010309283972, "learning_rate": 
9.97849063861667e-05, "memory(GiB)": 32.02, "train_speed(iter/s)": 0.331194, "rewards/chosen": 4.65625, "rewards/rejected": 3.09375, "rewards/accuracies": 0.800000011920929, "rewards/margins": 1.5546875, "logps/rejected": -324.0, "logps/chosen": -432.0, "logits/rejected": -0.4375, "logits/chosen": 0.5859375, "nll_loss": 0.6640625, "epoch": 0.4, "step": 20}, {"eval_loss": 0.6728515625, "eval_runtime": 1.2862, "eval_samples_per_second": 3.11, "eval_steps_per_second": 0.778, "eval_rewards/chosen": 9.375, "eval_rewards/rejected": 5.8125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 3.5625, "eval_logps/rejected": -182.0, "eval_logps/chosen": -14.3125, "eval_logits/rejected": 0.86328125, "eval_logits/chosen": -1.3125, "eval_nll_loss": 0.62109375, "epoch": 0.4, "step": 20}, {"loss": 0.8047607421875, "grad_norm": 0.5984307422346672, "learning_rate": 9.936876709681668e-05, "memory(GiB)": 32.02, "train_speed(iter/s)": 0.330795, "rewards/chosen": 9.125, "rewards/rejected": 3.6875, "rewards/accuracies": 1.0, "rewards/margins": 5.40625, "logps/rejected": -420.0, "logps/chosen": -326.0, "logits/rejected": -0.0390625, "logits/chosen": 0.255859375, "nll_loss": 0.5078125, "epoch": 0.5, "step": 25}, {"loss": 0.6901611328125, "grad_norm": 0.5223734060204478, "learning_rate": 9.873583924954152e-05, "memory(GiB)": 32.02, "train_speed(iter/s)": 0.339407, "rewards/chosen": 10.5625, "rewards/rejected": 3.765625, "rewards/accuracies": 1.0, "rewards/margins": 6.78125, "logps/rejected": -656.0, "logps/chosen": -400.0, "logits/rejected": 1.546875, "logits/chosen": -0.21875, "nll_loss": 0.78125, "epoch": 0.6, "step": 30}, {"loss": 0.7055419921875, "grad_norm": 1.0839186250481967, "learning_rate": 9.788890216258939e-05, "memory(GiB)": 32.02, "train_speed(iter/s)": 0.346622, "rewards/chosen": 12.75, "rewards/rejected": 3.125, "rewards/accuracies": 1.0, "rewards/margins": 9.625, "logps/rejected": -322.0, "logps/chosen": -592.0, "logits/rejected": -1.578125, "logits/chosen": 1.5625, 
"nll_loss": 0.96875, "epoch": 0.7, "step": 35}, {"loss": 0.6363037109375, "grad_norm": 13.025539300060226, "learning_rate": 9.68316749134364e-05, "memory(GiB)": 32.02, "train_speed(iter/s)": 0.345794, "rewards/chosen": 11.0625, "rewards/rejected": 3.546875, "rewards/accuracies": 1.0, "rewards/margins": 7.53125, "logps/rejected": -322.0, "logps/chosen": -225.0, "logits/rejected": 0.71484375, "logits/chosen": 0.08740234375, "nll_loss": 0.56640625, "epoch": 0.8, "step": 40}, {"eval_loss": 0.5283203125, "eval_runtime": 1.2905, "eval_samples_per_second": 3.1, "eval_steps_per_second": 0.775, "eval_rewards/chosen": 10.125, "eval_rewards/rejected": 2.796875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 7.3125, "eval_logps/rejected": -212.0, "eval_logps/chosen": -7.1875, "eval_logits/rejected": 0.9375, "eval_logits/chosen": -0.8046875, "eval_nll_loss": 0.3125, "epoch": 0.8, "step": 40}, {"loss": 0.5437744140625, "grad_norm": 0.49724108967226127, "learning_rate": 9.55688000075414e-05, "memory(GiB)": 32.02, "train_speed(iter/s)": 0.342553, "rewards/chosen": 14.0625, "rewards/rejected": 3.390625, "rewards/accuracies": 1.0, "rewards/margins": 10.6875, "logps/rejected": -430.0, "logps/chosen": -426.0, "logits/rejected": 0.002349853515625, "logits/chosen": 2.890625, "nll_loss": 0.470703125, "epoch": 0.9, "step": 45}, {"loss": 0.57803955078125, "grad_norm": 1.4155859018634052, "learning_rate": 9.410582299213573e-05, "memory(GiB)": 32.02, "train_speed(iter/s)": 0.346029, "rewards/chosen": 16.25, "rewards/rejected": 7.125, "rewards/accuracies": 1.0, "rewards/margins": 9.125, "logps/rejected": -1480.0, "logps/chosen": -804.0, "logits/rejected": 1.75, "logits/chosen": 0.2451171875, "nll_loss": 0.6953125, "epoch": 1.0, "step": 50}, {"loss": 0.6138671875, "grad_norm": 0.5694275670231188, "learning_rate": 9.244916810456821e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.347089, "rewards/chosen": 15.75, "rewards/rejected": 3.53125, "rewards/accuracies": 1.0, 
"rewards/margins": 12.25, "logps/rejected": -612.0, "logps/chosen": -386.0, "logits/rejected": 0.703125, "logits/chosen": 1.8359375, "nll_loss": 0.546875, "epoch": 1.1, "step": 55}, {"loss": 0.5795654296875, "grad_norm": 0.3472486708219598, "learning_rate": 9.060611006213832e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.349928, "rewards/chosen": 15.375, "rewards/rejected": 3.140625, "rewards/accuracies": 1.0, "rewards/margins": 12.1875, "logps/rejected": -520.0, "logps/chosen": -233.0, "logits/rejected": 2.15625, "logits/chosen": 1.65625, "nll_loss": 0.396484375, "epoch": 1.2, "step": 60}, {"eval_loss": 0.51806640625, "eval_runtime": 1.2918, "eval_samples_per_second": 3.096, "eval_steps_per_second": 0.774, "eval_rewards/chosen": 10.125, "eval_rewards/rejected": 1.703125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 8.4375, "eval_logps/rejected": -223.0, "eval_logps/chosen": -7.3125, "eval_logits/rejected": 1.703125, "eval_logits/chosen": -0.23828125, "eval_nll_loss": 0.31640625, "epoch": 1.2, "step": 60}, {"loss": 0.6193359375, "grad_norm": 0.48830819519223906, "learning_rate": 8.858474211729469e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.34619, "rewards/chosen": 15.5625, "rewards/rejected": 3.703125, "rewards/accuracies": 1.0, "rewards/margins": 11.875, "logps/rejected": -844.0, "logps/chosen": -346.0, "logits/rejected": 1.9921875, "logits/chosen": 0.97265625, "nll_loss": 0.640625, "epoch": 1.3, "step": 65}, {"loss": 0.5681396484375, "grad_norm": 0.3461001279478362, "learning_rate": 8.639394051847472e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.349389, "rewards/chosen": 11.5625, "rewards/rejected": 2.484375, "rewards/accuracies": 0.800000011920929, "rewards/margins": 9.125, "logps/rejected": -1120.0, "logps/chosen": -88.0, "logits/rejected": 2.28125, "logits/chosen": -0.6875, "nll_loss": 0.84375, "epoch": 1.4, "step": 70}, {"loss": 0.517205810546875, "grad_norm": 0.49738864814975586, "learning_rate": 8.404332553264547e-05, 
"memory(GiB)": 43.66, "train_speed(iter/s)": 0.351345, "rewards/chosen": 18.625, "rewards/rejected": 2.28125, "rewards/accuracies": 1.0, "rewards/margins": 16.375, "logps/rejected": -434.0, "logps/chosen": -462.0, "logits/rejected": 0.263671875, "logits/chosen": 2.515625, "nll_loss": 0.46875, "epoch": 1.5, "step": 75}, {"loss": 0.5629638671875, "grad_norm": 0.3498527388541882, "learning_rate": 8.154321920070414e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.353922, "rewards/chosen": 21.5, "rewards/rejected": 5.84375, "rewards/accuracies": 1.0, "rewards/margins": 15.6875, "logps/rejected": -132.0, "logps/chosen": -580.0, "logits/rejected": -1.296875, "logits/chosen": 3.5625, "nll_loss": 0.703125, "epoch": 1.6, "step": 80}, {"eval_loss": 0.5009765625, "eval_runtime": 1.3254, "eval_samples_per_second": 3.018, "eval_steps_per_second": 0.754, "eval_rewards/chosen": 10.1875, "eval_rewards/rejected": -6.59375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 16.75, "eval_logps/rejected": -306.0, "eval_logps/chosen": -6.3125, "eval_logits/rejected": 0.3125, "eval_logits/chosen": -0.68359375, "eval_nll_loss": 0.2734375, "epoch": 1.6, "step": 80}, {"loss": 0.70977783203125, "grad_norm": 0.6649877164961882, "learning_rate": 7.890460001124242e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.351477, "rewards/chosen": 15.5, "rewards/rejected": -2.859375, "rewards/accuracies": 1.0, "rewards/margins": 18.375, "logps/rejected": -768.0, "logps/chosen": -326.0, "logits/rejected": 0.388671875, "logits/chosen": 0.72265625, "nll_loss": 0.458984375, "epoch": 1.7, "step": 85}, {"loss": 0.5434326171875, "grad_norm": 0.5774391096810936, "learning_rate": 7.613905469171246e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.351018, "rewards/chosen": 14.75, "rewards/rejected": 0.5390625, "rewards/accuracies": 1.0, "rewards/margins": 14.25, "logps/rejected": -520.0, "logps/chosen": -290.0, "logits/rejected": 1.515625, "logits/chosen": 2.15625, "nll_loss": 0.56640625, "epoch": 
1.8, "step": 90}, {"loss": 0.60682373046875, "grad_norm": 0.46335079012099145, "learning_rate": 7.325872732868869e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.351082, "rewards/chosen": 16.75, "rewards/rejected": 0.7890625, "rewards/accuracies": 1.0, "rewards/margins": 15.9375, "logps/rejected": -620.0, "logps/chosen": -366.0, "logits/rejected": 1.1640625, "logits/chosen": 1.65625, "nll_loss": 0.5, "epoch": 1.9, "step": 95}, {"loss": 0.51856689453125, "grad_norm": 0.44055183589508673, "learning_rate": 7.027626604064969e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.350869, "rewards/chosen": 15.6875, "rewards/rejected": 1.65625, "rewards/accuracies": 1.0, "rewards/margins": 14.0, "logps/rejected": -360.0, "logps/chosen": -276.0, "logits/rejected": 0.1025390625, "logits/chosen": 0.59765625, "nll_loss": 0.470703125, "epoch": 2.0, "step": 100}, {"eval_loss": 0.50390625, "eval_runtime": 1.3254, "eval_samples_per_second": 3.018, "eval_steps_per_second": 0.754, "eval_rewards/chosen": 10.125, "eval_rewards/rejected": -4.0, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 14.125, "eval_logps/rejected": -280.0, "eval_logps/chosen": -6.84375, "eval_logits/rejected": 0.66796875, "eval_logits/chosen": -0.63671875, "eval_nll_loss": 0.298828125, "epoch": 2.0, "step": 100}, {"loss": 0.540869140625, "grad_norm": 0.43319830084665456, "learning_rate": 6.720476743745072e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.34793, "rewards/chosen": 20.375, "rewards/rejected": 4.25, "rewards/accuracies": 1.0, "rewards/margins": 16.125, "logps/rejected": -580.0, "logps/chosen": -442.0, "logits/rejected": 0.185546875, "logits/chosen": 3.5, "nll_loss": 0.60546875, "epoch": 2.1, "step": 105}, {"loss": 0.5629150390625, "grad_norm": 0.5087186205209226, "learning_rate": 6.405771911037699e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.34833, "rewards/chosen": 20.875, "rewards/rejected": 6.34375, "rewards/accuracies": 1.0, "rewards/margins": 14.5, "logps/rejected": -1280.0, 
"logps/chosen": -688.0, "logits/rejected": 1.9375, "logits/chosen": 0.41015625, "nll_loss": 0.69140625, "epoch": 2.2, "step": 110}, {"loss": 0.596630859375, "grad_norm": 0.30352973245628134, "learning_rate": 6.08489404053159e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.349547, "rewards/chosen": 21.0, "rewards/rejected": 5.5, "rewards/accuracies": 1.0, "rewards/margins": 15.4375, "logps/rejected": -704.0, "logps/chosen": -568.0, "logits/rejected": 1.0625, "logits/chosen": 1.546875, "nll_loss": 0.5390625, "epoch": 2.3, "step": 115}, {"loss": 0.463055419921875, "grad_norm": 0.5099680534446042, "learning_rate": 5.7592521739125726e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.35126, "rewards/chosen": 16.625, "rewards/rejected": 3.484375, "rewards/accuracies": 1.0, "rewards/margins": 13.1875, "logps/rejected": -316.0, "logps/chosen": -368.0, "logits/rejected": 0.002349853515625, "logits/chosen": 2.40625, "nll_loss": 0.37109375, "epoch": 2.4, "step": 120}, {"eval_loss": 0.49755859375, "eval_runtime": 1.3007, "eval_samples_per_second": 3.075, "eval_steps_per_second": 0.769, "eval_rewards/chosen": 10.1875, "eval_rewards/rejected": -4.0, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 14.1875, "eval_logps/rejected": -280.0, "eval_logps/chosen": -6.4375, "eval_logits/rejected": 0.416015625, "eval_logits/chosen": -0.96484375, "eval_nll_loss": 0.279296875, "epoch": 2.4, "step": 120}, {"loss": 0.5360504150390625, "grad_norm": 0.6759212291558464, "learning_rate": 5.430276272567485e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.350267, "rewards/chosen": 11.75, "rewards/rejected": -7.96875, "rewards/accuracies": 1.0, "rewards/margins": 19.75, "logps/rejected": -1288.0, "logps/chosen": -62.5, "logits/rejected": 0.82421875, "logits/chosen": -1.4140625, "nll_loss": 0.7265625, "epoch": 2.5, "step": 125}, {"loss": 0.4655029296875, "grad_norm": 0.3635118516286732, "learning_rate": 5.0994109383253506e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.35164, 
"rewards/chosen": 18.875, "rewards/rejected": 3.515625, "rewards/accuracies": 1.0, "rewards/margins": 15.3125, "logps/rejected": -314.0, "logps/chosen": -348.0, "logits/rejected": 0.546875, "logits/chosen": 2.3125, "nll_loss": 0.484375, "epoch": 2.6, "step": 130}, {"loss": 0.44596214294433595, "grad_norm": 0.4546423377677983, "learning_rate": 4.768109069909307e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.352504, "rewards/chosen": 13.375, "rewards/rejected": -1.5625, "rewards/accuracies": 1.0, "rewards/margins": 14.9375, "logps/rejected": -624.0, "logps/chosen": -157.0, "logits/rejected": 0.88671875, "logits/chosen": -0.375, "nll_loss": 0.24609375, "epoch": 2.7, "step": 135}, {"loss": 0.4998992919921875, "grad_norm": 0.3766526906078867, "learning_rate": 4.4378254829551396e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.353731, "rewards/chosen": 18.625, "rewards/rejected": 1.5546875, "rewards/accuracies": 1.0, "rewards/margins": 17.0, "logps/rejected": -708.0, "logps/chosen": -392.0, "logits/rejected": 1.1484375, "logits/chosen": 1.484375, "nll_loss": 0.734375, "epoch": 2.8, "step": 140}, {"eval_loss": 0.49853515625, "eval_runtime": 1.2976, "eval_samples_per_second": 3.083, "eval_steps_per_second": 0.771, "eval_rewards/chosen": 10.1875, "eval_rewards/rejected": -4.1875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 14.375, "eval_logps/rejected": -282.0, "eval_logps/chosen": -6.65625, "eval_logits/rejected": 0.28515625, "eval_logits/chosen": -1.0546875, "eval_nll_loss": 0.2890625, "epoch": 2.8, "step": 140}, {"loss": 0.5592437744140625, "grad_norm": 0.7368475635982081, "learning_rate": 4.11001052161225e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.350922, "rewards/chosen": 21.75, "rewards/rejected": 4.90625, "rewards/accuracies": 1.0, "rewards/margins": 16.875, "logps/rejected": -664.0, "logps/chosen": -624.0, "logits/rejected": 0.7578125, "logits/chosen": 1.484375, "nll_loss": 0.7421875, "epoch": 2.9, "step": 145}, {"loss": 
0.4158843994140625, "grad_norm": 0.526438371292198, "learning_rate": 3.786103689779861e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.350374, "rewards/chosen": 22.5, "rewards/rejected": 3.5, "rewards/accuracies": 1.0, "rewards/margins": 19.0, "logps/rejected": -310.0, "logps/chosen": -406.0, "logits/rejected": -0.11962890625, "logits/chosen": 2.53125, "nll_loss": 0.39453125, "epoch": 3.0, "step": 150}, {"loss": 0.5336650848388672, "grad_norm": 0.3832315044394523, "learning_rate": 3.467527329945026e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.350973, "rewards/chosen": 19.125, "rewards/rejected": 1.7265625, "rewards/accuracies": 1.0, "rewards/margins": 17.375, "logps/rejected": -588.0, "logps/chosen": -600.0, "logits/rejected": 1.328125, "logits/chosen": 1.1484375, "nll_loss": 0.515625, "epoch": 3.1, "step": 155}, {"loss": 0.456103515625, "grad_norm": 0.41804293149830996, "learning_rate": 3.1556803773799614e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.350912, "rewards/chosen": 21.5, "rewards/rejected": 4.71875, "rewards/accuracies": 1.0, "rewards/margins": 16.75, "logps/rejected": -360.0, "logps/chosen": -416.0, "logits/rejected": 0.10693359375, "logits/chosen": 2.84375, "nll_loss": 0.38671875, "epoch": 3.2, "step": 160}, {"eval_loss": 0.4970703125, "eval_runtime": 1.2858, "eval_samples_per_second": 3.111, "eval_steps_per_second": 0.778, "eval_rewards/chosen": 10.1875, "eval_rewards/rejected": -4.59375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 14.75, "eval_logps/rejected": -286.0, "eval_logps/chosen": -6.46875, "eval_logits/rejected": 0.1669921875, "eval_logits/chosen": -1.046875, "eval_nll_loss": 0.28125, "epoch": 3.2, "step": 160}, {"loss": 0.4285240173339844, "grad_norm": 0.9977986802164632, "learning_rate": 2.8519322171253602e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.350263, "rewards/chosen": 16.625, "rewards/rejected": -1.7890625, "rewards/accuracies": 1.0, "rewards/margins": 18.5, "logps/rejected": -498.0, 
"logps/chosen": -145.0, "logits/rejected": 0.234375, "logits/chosen": -0.173828125, "nll_loss": 0.16796875, "epoch": 3.3, "step": 165}, {"loss": 0.45649566650390627, "grad_norm": 0.5073480559704879, "learning_rate": 2.5576166707349385e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.35017, "rewards/chosen": 25.0, "rewards/rejected": 5.875, "rewards/accuracies": 1.0, "rewards/margins": 19.125, "logps/rejected": -207.0, "logps/chosen": -640.0, "logits/rejected": -0.92578125, "logits/chosen": 3.234375, "nll_loss": 0.453125, "epoch": 3.4, "step": 170}, {"loss": 0.39521121978759766, "grad_norm": 0.5295452954524565, "learning_rate": 2.2740261391866637e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.350866, "rewards/chosen": 21.75, "rewards/rejected": 7.03125, "rewards/accuracies": 1.0, "rewards/margins": 14.75, "logps/rejected": -592.0, "logps/chosen": -466.0, "logits/rejected": 0.392578125, "logits/chosen": 0.498046875, "nll_loss": 0.65625, "epoch": 3.5, "step": 175}, {"loss": 0.500946044921875, "grad_norm": 0.30567037033016853, "learning_rate": 2.002405927680374e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.351966, "rewards/chosen": 21.25, "rewards/rejected": 4.125, "rewards/accuracies": 1.0, "rewards/margins": 17.125, "logps/rejected": -262.0, "logps/chosen": -482.0, "logits/rejected": -0.09326171875, "logits/chosen": 3.09375, "nll_loss": 0.546875, "epoch": 3.6, "step": 180}, {"eval_loss": 0.501953125, "eval_runtime": 1.2632, "eval_samples_per_second": 3.167, "eval_steps_per_second": 0.792, "eval_rewards/chosen": 10.125, "eval_rewards/rejected": -4.40625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 14.5, "eval_logps/rejected": -284.0, "eval_logps/chosen": -7.0, "eval_logits/rejected": 0.060546875, "eval_logits/chosen": -1.1640625, "eval_nll_loss": 0.302734375, "epoch": 3.6, "step": 180}, {"loss": 0.4772364616394043, "grad_norm": 0.9188395046874438, "learning_rate": 1.743948777242814e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.35126, 
"rewards/chosen": 21.25, "rewards/rejected": 4.40625, "rewards/accuracies": 1.0, "rewards/margins": 16.875, "logps/rejected": -229.0, "logps/chosen": -536.0, "logits/rejected": -0.10009765625, "logits/chosen": 3.0, "nll_loss": 0.609375, "epoch": 3.7, "step": 185}, {"loss": 0.4629364013671875, "grad_norm": 0.5288801717382393, "learning_rate": 1.4997896271528739e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.352271, "rewards/chosen": 19.75, "rewards/rejected": 1.34375, "rewards/accuracies": 1.0, "rewards/margins": 18.5, "logps/rejected": -708.0, "logps/chosen": -330.0, "logits/rejected": 1.4140625, "logits/chosen": 0.9765625, "nll_loss": 0.6875, "epoch": 3.8, "step": 190}, {"loss": 0.38446922302246095, "grad_norm": 0.5916934882939896, "learning_rate": 1.2710006311864104e-05, "memory(GiB)": 43.66, "train_speed(iter/s)": 0.352064, "rewards/chosen": 22.75, "rewards/rejected": 2.765625, "rewards/accuracies": 1.0, "rewards/margins": 20.0, "logps/rejected": -680.0, "logps/chosen": -414.0, "logits/rejected": 0.78125, "logits/chosen": -0.31640625, "nll_loss": 0.435546875, "epoch": 3.9, "step": 195}, {"loss": 0.43031768798828124, "grad_norm": 0.6513423080203622, "learning_rate": 1.0585864495652897e-05, "memory(GiB)": 56.51, "train_speed(iter/s)": 0.351751, "rewards/chosen": 21.875, "rewards/rejected": 3.78125, "rewards/accuracies": 1.0, "rewards/margins": 18.125, "logps/rejected": -728.0, "logps/chosen": -414.0, "logits/rejected": 0.53125, "logits/chosen": 1.2578125, "nll_loss": 0.48046875, "epoch": 4.0, "step": 200}, {"eval_loss": 0.509765625, "eval_runtime": 1.2468, "eval_samples_per_second": 3.208, "eval_steps_per_second": 0.802, "eval_rewards/chosen": 10.125, "eval_rewards/rejected": -4.59375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 14.75, "eval_logps/rejected": -286.0, "eval_logps/chosen": -7.5625, "eval_logits/rejected": 0.0634765625, "eval_logits/chosen": -1.265625, "eval_nll_loss": 0.328125, "epoch": 4.0, "step": 200}, {"loss": 0.4748867034912109, 
"grad_norm": 0.46757622724638764, "learning_rate": 8.634798372847148e-06, "memory(GiB)": 56.51, "train_speed(iter/s)": 0.35013, "rewards/chosen": 21.5, "rewards/rejected": 5.40625, "rewards/accuracies": 1.0, "rewards/margins": 16.125, "logps/rejected": -704.0, "logps/chosen": -428.0, "logits/rejected": 1.0546875, "logits/chosen": 0.8984375, "nll_loss": 0.4375, "epoch": 4.1, "step": 205}, {"loss": 0.42456893920898436, "grad_norm": 0.5521912104468201, "learning_rate": 6.865375481914016e-06, "memory(GiB)": 56.51, "train_speed(iter/s)": 0.350785, "rewards/chosen": 20.125, "rewards/rejected": 3.71875, "rewards/accuracies": 1.0, "rewards/margins": 16.375, "logps/rejected": -230.0, "logps/chosen": -274.0, "logits/rejected": -0.33984375, "logits/chosen": 2.734375, "nll_loss": 0.34375, "epoch": 4.2, "step": 210}, {"loss": 0.42719383239746095, "grad_norm": 0.6123257585261611, "learning_rate": 5.285365727986707e-06, "memory(GiB)": 56.51, "train_speed(iter/s)": 0.35093, "rewards/chosen": 19.0, "rewards/rejected": 0.8046875, "rewards/accuracies": 1.0, "rewards/margins": 18.25, "logps/rejected": -1216.0, "logps/chosen": -211.0, "logits/rejected": 1.8359375, "logits/chosen": -1.28125, "nll_loss": 0.326171875, "epoch": 4.3, "step": 215}, {"loss": 0.3799613952636719, "grad_norm": 0.3529237641617113, "learning_rate": 3.901707263589671e-06, "memory(GiB)": 56.51, "train_speed(iter/s)": 0.350899, "rewards/chosen": 16.25, "rewards/rejected": -2.078125, "rewards/accuracies": 1.0, "rewards/margins": 18.375, "logps/rejected": -984.0, "logps/chosen": -135.0, "logits/rejected": 1.640625, "logits/chosen": -1.171875, "nll_loss": 0.236328125, "epoch": 4.4, "step": 220}, {"eval_loss": 0.51318359375, "eval_runtime": 1.3289, "eval_samples_per_second": 3.01, "eval_steps_per_second": 0.753, "eval_rewards/chosen": 10.0625, "eval_rewards/rejected": -5.0, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 15.0625, "eval_logps/rejected": -290.0, "eval_logps/chosen": -7.9375, "eval_logits/rejected": 
0.03662109375, "eval_logits/chosen": -1.3125, "eval_nll_loss": 0.345703125, "epoch": 4.4, "step": 220}, {"loss": 0.42312088012695315, "grad_norm": 0.4122296158011087, "learning_rate": 2.7204760217631074e-06, "memory(GiB)": 56.51, "train_speed(iter/s)": 0.350059, "rewards/chosen": 25.875, "rewards/rejected": 6.09375, "rewards/accuracies": 1.0, "rewards/margins": 19.75, "logps/rejected": -280.0, "logps/chosen": -696.0, "logits/rejected": -0.318359375, "logits/chosen": 4.0, "nll_loss": 0.70703125, "epoch": 4.5, "step": 225}, {"loss": 0.3846301078796387, "grad_norm": 0.446952222017286, "learning_rate": 1.7468590353731495e-06, "memory(GiB)": 56.51, "train_speed(iter/s)": 0.35023, "rewards/chosen": 20.5, "rewards/rejected": 0.07177734375, "rewards/accuracies": 1.0, "rewards/margins": 20.375, "logps/rejected": -716.0, "logps/chosen": -320.0, "logits/rejected": 0.1640625, "logits/chosen": 0.98828125, "nll_loss": 0.419921875, "epoch": 4.6, "step": 230}, {"loss": 0.3764499664306641, "grad_norm": 0.7152024714830698, "learning_rate": 9.851316597681958e-07, "memory(GiB)": 56.51, "train_speed(iter/s)": 0.350576, "rewards/chosen": 17.25, "rewards/rejected": -1.71875, "rewards/accuracies": 1.0, "rewards/margins": 18.875, "logps/rejected": -712.0, "logps/chosen": -221.0, "logits/rejected": 1.296875, "logits/chosen": 0.1025390625, "nll_loss": 0.3046875, "epoch": 4.7, "step": 235}, {"loss": 0.4298358917236328, "grad_norm": 0.32906938877953124, "learning_rate": 4.386387988014273e-07, "memory(GiB)": 56.51, "train_speed(iter/s)": 0.351336, "rewards/chosen": 27.0, "rewards/rejected": 6.5, "rewards/accuracies": 1.0, "rewards/margins": 20.5, "logps/rejected": -828.0, "logps/chosen": -556.0, "logits/rejected": -0.416015625, "logits/chosen": 0.9375, "nll_loss": 0.8515625, "epoch": 4.8, "step": 240}, {"eval_loss": 0.51806640625, "eval_runtime": 1.2957, "eval_samples_per_second": 3.087, "eval_steps_per_second": 0.772, "eval_rewards/chosen": 10.0, "eval_rewards/rejected": -5.1875, 
"eval_rewards/accuracies": 1.0, "eval_rewards/margins": 15.1875, "eval_logps/rejected": -292.0, "eval_logps/chosen": -8.3125, "eval_logits/rejected": 0.037109375, "eval_logits/chosen": -1.34375, "eval_nll_loss": 0.361328125, "epoch": 4.8, "step": 240}, {"loss": 0.4468417167663574, "grad_norm": 0.4663439808991418, "learning_rate": 1.0978021666005478e-07, "memory(GiB)": 56.51, "train_speed(iter/s)": 0.350845, "rewards/chosen": 22.75, "rewards/rejected": 2.78125, "rewards/accuracies": 1.0, "rewards/margins": 20.0, "logps/rejected": -624.0, "logps/chosen": -548.0, "logits/rejected": 0.1611328125, "logits/chosen": 1.9296875, "nll_loss": 0.57421875, "epoch": 4.9, "step": 245}, {"loss": 0.4753856658935547, "grad_norm": 0.3211848914019023, "learning_rate": 0.0, "memory(GiB)": 56.51, "train_speed(iter/s)": 0.351443, "rewards/chosen": 21.875, "rewards/rejected": 1.78125, "rewards/accuracies": 1.0, "rewards/margins": 20.125, "logps/rejected": -464.0, "logps/chosen": -398.0, "logits/rejected": 0.6953125, "logits/chosen": 1.609375, "nll_loss": 0.466796875, "epoch": 5.0, "step": 250}, {"eval_loss": 0.51513671875, "eval_runtime": 1.2604, "eval_samples_per_second": 3.174, "eval_steps_per_second": 0.793, "eval_rewards/chosen": 10.0625, "eval_rewards/rejected": -5.1875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 15.25, "eval_logps/rejected": -292.0, "eval_logps/chosen": -8.125, "eval_logits/rejected": 0.0341796875, "eval_logits/chosen": -1.3515625, "eval_nll_loss": 0.353515625, "epoch": 5.0, "step": 250}, {"train_runtime": 710.5808, "train_samples_per_second": 2.779, "train_steps_per_second": 0.352, "total_flos": 219416242290688.0, "train_loss": 0.6236606960296631, "epoch": 5.0, "step": 250}], "memory": 56.5078125} diff --git a/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/runs/events.out.tfevents.1737938805.kml-task-547024-record-9975763-prod-worker-0.103275.0 
b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/runs/events.out.tfevents.1737938805.kml-task-547024-record-9975763-prod-worker-0.103275.0 new file mode 100644 index 0000000000000000000000000000000000000000..26b4712bb92fff2000f0bc82bbd116788e252c60 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4200_rank8_epoch5_random20/v1-20250127-004541/runs/events.out.tfevents.1737938805.kml-task-547024-record-9975763-prod-worker-0.103275.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7477f97c9c9172ec8faffced07a8b149f5a0f0786d8f805e9d78de6acc8b59d +size 61688