Aswanth-Azma committed on
Commit a9fcdd5 · verified · 1 Parent(s): 0865866

Upload training_config.json with huggingface_hub

Files changed (1)
  1. training_config.json +93 -0
training_config.json ADDED
@@ -0,0 +1,93 @@
+ {
+   "project_name": "azma-hermes-pro-llama-3-8b-030524",
+   "new_model_id": "azma-hermes-pro-llama-3-8b-030524",
+   "base_model_id": "NousResearch/Hermes-2-Pro-Llama-3-8B",
+   "adapter_id": "azma-hermes-pro-llama-3-8b-030524-adapter",
+   "chat_template": "chatml",
+   "lora_config": {
+     "lora_rank": 16,
+     "lora_alpha": 32,
+     "lora_dropout": 0.1,
+     "target_modules": [
+       "k_proj",
+       "v_proj",
+       "o_proj",
+       "q_proj",
+       "up_proj",
+       "gate_proj",
+       "down_proj"
+     ],
+     "modules_to_save": [
+       "lm_head"
+     ],
+     "task_type": "CAUSAL_LM"
+   },
+   "load_config": {
+     "use_flash_attention": true,
+     "load_in_4bit": false,
+     "cache_dir": "./",
+     "torch_dtype": "auto",
+     "device_map": "auto",
+     "pad_to_multiples": false
+   },
+   "dataset_config": {
+     "dataset_id": [
+       "Azma-AI/azma-mermaid-dataset-single-turn-chatml",
+       "Azma-AI/azma-dataset-v2-mermaid-without-thoughts-final-chatml-8192-seq-len"
+     ],
+     "max_seq_length": 8192,
+     "shuffle": true,
+     "data_collator": "default",
+     "template_version": 1.0,
+     "dataset_text_field": "text",
+     "add_custom_tokens": {
+       "bos_token": "<|begin_of_text|>",
+       "eos_token": "<|im_end|>",
+       "pad_token": "<|end_of_text|>"
+     }
+   },
+   "dpo_args": {
+     "beta": 0.6,
+     "loss_types": "sigmoid"
+   },
+   "training_args": {
+     "seed": 42,
+     "max_steps": -1,
+     "weight_decay": 0.01,
+     "num_train_epochs": 1,
+     "learning_rate": 1e-05,
+     "output_dir": "./results",
+     "optim": "paged_adamw_32bit",
+     "gradient_accumulation_steps": 2,
+     "per_device_train_batch_size": 8,
+     "per_device_eval_batch_size": 2,
+     "group_by_length": false,
+     "resume_from_checkpoint": true,
+     "gradient_checkpointing": true,
+     "gradient_checkpointing_kwargs": null,
+     "do_eval": false,
+     "eval_steps": 100,
+     "evaluation_strategy": "steps",
+     "save_steps": 100,
+     "save_total_limit": 2,
+     "save_strategy": "steps",
+     "logging_steps": 5,
+     "log_level": "info",
+     "logging_strategy": "steps",
+     "push_to_hub": true,
+     "hub_model_id": "Azma-AI/azma-hermes-pro-llama-3-8b-030524-adapter",
+     "hub_private_repo": true,
+     "hub_strategy": "checkpoint",
+     "report_to": "wandb",
+     "warmup_steps": 5,
+     "neftune_noise_alpha": 5,
+     "lr_scheduler_type": "cosine",
+     "auto_find_batch_size": true,
+     "load_best_model_at_end": true,
+     "deepspeed": null,
+     "bf16": true,
+     "fp16": false
+   },
+   "merge_final_model": true,
+   "push_to_organization": true
+ }
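
For reference, the "lora_config" block maps almost one-to-one onto peft's LoraConfig. The sketch below is an assumption about how a training script might consume it, not the author's actual code; only the file name training_config.json comes from this commit, and the variable names are illustrative.

import json

from peft import LoraConfig

with open("training_config.json") as f:
    cfg = json.load(f)

lora = cfg["lora_config"]
peft_config = LoraConfig(
    r=lora["lora_rank"],                      # 16
    lora_alpha=lora["lora_alpha"],            # 32
    lora_dropout=lora["lora_dropout"],        # 0.1
    target_modules=lora["target_modules"],    # q/k/v/o plus the MLP projections
    modules_to_save=lora["modules_to_save"],  # lm_head is trained and saved in full
    task_type=lora["task_type"],              # "CAUSAL_LM"
)

With rank 16 and alpha 32 the effective LoRA scaling is alpha/rank = 2, and listing lm_head under modules_to_save means the output head is fully trained and stored alongside the adapter weights.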
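The "load_config" and "add_custom_tokens" entries suggest how the base model and tokenizer are prepared. Below is a hedged sketch using the standard transformers API; mapping use_flash_attention to attn_implementation="flash_attention_2" is one plausible interpretation, and handling of pad_to_multiples and load_in_4bit is omitted.

import json

from transformers import AutoModelForCausalLM, AutoTokenizer

with open("training_config.json") as f:
    cfg = json.load(f)

load_cfg = cfg["load_config"]
model = AutoModelForCausalLM.from_pretrained(
    cfg["base_model_id"],                 # NousResearch/Hermes-2-Pro-Llama-3-8B
    torch_dtype=load_cfg["torch_dtype"],  # "auto"
    device_map=load_cfg["device_map"],    # "auto"
    cache_dir=load_cfg["cache_dir"],      # "./"
    attn_implementation="flash_attention_2" if load_cfg["use_flash_attention"] else "eager",
)

tokenizer = AutoTokenizer.from_pretrained(cfg["base_model_id"], cache_dir=load_cfg["cache_dir"])
# Register the ChatML-style special tokens from add_custom_tokens;
# using <|im_end|> as EOS matches the "chat_template": "chatml" setting.
tokenizer.add_special_tokens(cfg["dataset_config"]["add_custom_tokens"])
model.resize_token_embeddings(len(tokenizer))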
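Every key under "training_args" matches a transformers.TrainingArguments field, so one plausible way the script consumes that block is to unpack it directly, as sketched below (an assumption; note that "evaluation_strategy" is the older spelling, which later transformers releases rename to eval_strategy, so the exact library version matters). The "dpo_args" block (beta 0.6, sigmoid loss) would presumably be handed to a trl DPO trainer on top of these arguments, which is not shown here.

import json

from transformers import TrainingArguments

with open("training_config.json") as f:
    cfg = json.load(f)

# Keys such as optim, neftune_noise_alpha, hub_strategy and bf16 are all
# regular TrainingArguments fields, so the dict can be unpacked as-is.
training_args = TrainingArguments(**cfg["training_args"])

print(training_args.learning_rate, training_args.num_train_epochs)  # 1e-05 and 1 epoch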