Files changed (1)
  1. Goku-8x22B-v0.1.yaml +81 -0
Goku-8x22B-v0.1.yaml ADDED
@@ -0,0 +1,81 @@
+ base_model: v2ray/Mixtral-8x22B-v0.1
+ model_type: AutoModelForCausalLM
+ tokenizer_type: LlamaTokenizer
+ trust_remote_code: true
+
+ load_in_8bit: false
+ load_in_4bit: true
+ strict: false
+
+ datasets:
+   - path: philschmid/guanaco-sharegpt-style
+     type: sharegpt
+     prompt_style: chatml
+
+ dataset_prepared_path: last_run_prepared
+ val_set_size: 0
+ output_dir: ./models/Goku-8x22B-v0.1
+
+ ## You can optionally freeze the entire model and unfreeze a subset of parameters
+ unfrozen_parameters:
+ # - ^lm_head.weight$
+ # - ^model.embed_tokens.weight$[:32000]
+ # - model.layers.2[0-9]+.block_sparse_moe.gate
+ # - model.layers.2[0-9]+.block_sparse_moe.experts
+ # - model.layers.3[0-9]+.block_sparse_moe.gate
+ # - model.layers.3[0-9]+.block_sparse_moe.experts
+
+ model_config:
+   output_router_logits: true
+
+ sequence_len: 2048
+ sample_packing: false
+ pad_to_sequence_len: true
+
+ adapter: qlora
+ lora_model_dir:
+
+ lora_r: 16
+ lora_alpha: 8
+ lora_dropout: 0.05
+ lora_target_modules:
+ lora_target_linear: true
+ lora_fan_in_fan_out:
+
+ gradient_accumulation_steps: 4
+ micro_batch_size: 6
+ num_epochs: 1
+
+ optimizer: paged_adamw_8bit
+ lr_scheduler: cosine
+ learning_rate: 0.0002
+
+ train_on_inputs:
+ group_by_length: false
+ bf16: auto
+ fp16: false
+ tf32: false
+
+ gradient_checkpointing: true
+ early_stopping_patience:
+ resume_from_checkpoint:
+ local_rank:
+ logging_steps: 1
+ xformers_attention:
+ flash_attention: true
+
+ loss_watchdog_threshold: 5.0
+ loss_watchdog_patience: 3
+
+ warmup_steps: 10
+ evals_per_epoch: 4
+ eval_table_size:
+ eval_max_new_tokens: 128
+ saves_per_epoch: 1
+ debug:
+ weight_decay: 0.0
+
+ special_tokens:
+   eos_token: "<|im_end|>"
+ tokens:
+   - "<|im_start|>"
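Note: the field names here (sample_packing, micro_batch_size, loss_watchdog_threshold, dataset_prepared_path) match Axolotl's config schema, so this file is presumably meant to be consumed by Axolotl. Assuming that, training would typically be launched with Axolotl's documented CLI, pointing it at this file:

    accelerate launch -m axolotl.cli.train Goku-8x22B-v0.1.yaml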