{
    "base_model_id": "deepseek-ai/deepseek-coder-1.3b-instruct",
    "quantitize": "fp16",
    "dataset": "Arithmetic_Hard",
    "data_collator": "DataCollatorForLanguageModeling",
    "peft_config": {
        "lora": {
            "r": 32,
            "lora_alpha": 64,
            "target_modules": [
                "q_proj",
                "k_proj",
                "v_proj",
                "o_proj",
                "gate_proj",
                "up_proj",
                "down_proj"
            ],
            "bias": "none",
            "lora_dropout": 0.05,
            "task_type": "CAUSAL_LM"
        },
        "lora_large": {
            "r": 128,
            "lora_alpha": 256,
            "target_modules": [
                "q_proj",
                "k_proj",
                "v_proj",
                "o_proj",
                "gate_proj",
                "up_proj",
                "down_proj"
            ],
            "bias": "none",
            "lora_dropout": 0.05,
            "task_type": "CAUSAL_LM"
        },
        "p_tuning": {
            "num_virtual_tokens": 16,
            "num_transformer_submodules": 1,
            "token_dim": 2048,
            "encoder_hidden_size": 2048,
            "task_type": "CAUSAL_LM"
        }
    },
    "training_args": {
        "warmup_steps": 500,
        "per_device_train_batch_size": 4,
        "per_device_eval_batch_size": 4,
        "gradient_accumulation_steps": 1,
        "max_steps": 30000,
        "learning_rate": 0.0001,
        "optim": "adamw_torch",
        "logging_steps": 100,
        "save_strategy": "steps",
        "save_steps": 2000,
        "evaluation_strategy": "steps",
        "eval_steps": 1000,
        "weight_decay": 0.01,
        "report_to": "wandb",
        "fp16": true,
        "output_dir": "runs/deepseek-curri-5",
        "logging_dir": "runs/deepseek-curri-5/logs"
    },
    "tokenizer": {
        "tokenize_config": {
            "truncation": true,
            "max_length": 512,
            "padding": "max_length"
        },
        "prompt_template": "config/qa_template.txt"
    }
}
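
For reference, a minimal sketch of how a training script might consume this config. The config filename, the choice of the "lora" adapter over "lora_large"/"p_tuning", and the mapping of `"quantitize": "fp16"` to half-precision model loading are assumptions here, not something the file itself specifies; the `peft_config.lora` and `training_args` blocks do map field-for-field onto PEFT's `LoraConfig` and transformers' `TrainingArguments`.

```python
# Hypothetical loader sketch; paths and the adapter choice are assumptions.
import json

import torch
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments

with open("config/train_config.json") as f:  # hypothetical filename
    cfg = json.load(f)

tokenizer = AutoTokenizer.from_pretrained(cfg["base_model_id"])
model = AutoModelForCausalLM.from_pretrained(
    cfg["base_model_id"],
    # Assumed interpretation of "quantitize": "fp16" -> load in half precision.
    torch_dtype=torch.float16 if cfg["quantitize"] == "fp16" else None,
)

# Wrap the base model with one of the configured PEFT adapters; the keys in
# the "lora" block above correspond one-to-one to LoraConfig fields.
model = get_peft_model(model, LoraConfig(**cfg["peft_config"]["lora"]))
model.print_trainable_parameters()

# The "training_args" block unpacks directly into TrainingArguments.
args = TrainingArguments(**cfg["training_args"])
```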