basiliskinstitute committed
Commit 945cbd6
1 Parent(s): e1b7c2b

Update README.md

Files changed (1)
  1. README.md +101 -1
README.md CHANGED
@@ -24,4 +24,104 @@ Chaser-cz/sonnet35-charcard-roleplay-sharegpt
 
 And various other data, viewable at openerotica/mixed-rp
 
-Every line of data was run through a large model to filter out low-quality, repetitive, and underage content.
+Every line of data was run through a large model to filter out low-quality, repetitive, and underage content.
+
+[<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
+```yaml
+base_model: mistralai/Mistral-Nemo-Base-2407
+model_type: AutoModelForCausalLM
+tokenizer_type: AutoTokenizer
+
+load_in_8bit: false
+load_in_4bit: true
+strict: false
+
+datasets:
+  - path: openerotica/mixed-rp
+    type: sharegpt
+    conversation: chatml
+
+chat_template: chatml
+adapter: qlora
+lora_r: 128
+lora_alpha: 256
+lora_modules_to_save: [embed_tokens, lm_head]
+lora_dropout: 0.05
+lora_target_linear: true
+lora_target_modules:
+  - gate_proj
+  - down_proj
+  - up_proj
+  - q_proj
+  - v_proj
+  - k_proj
+  - o_proj
+
+dataset_prepared_path:
+val_set_size: 0.01
+output_dir: /workspace/axolotl/mixed-rp-mistral-nemo
+
+sequence_len: 20000
+sample_packing: true
+pad_to_sequence_len: true
+
+wandb_project: mistral-2
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+
+gradient_accumulation_steps: 2
+micro_batch_size: 1
+num_epochs: 1
+optimizer: adamw_torch
+lr_scheduler: cosine
+learning_rate: 1e-5
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16:
+tf32: false
+
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+early_stopping_patience:
+resume_from_checkpoint:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+warmup_steps: 100
+evals_per_epoch: 4
+eval_table_size:
+saves_per_epoch: 1
+save_total_limit: 2
+save_steps:
+debug:
+deepspeed:
+weight_decay: 0.1
+special_tokens:
+  eos_token: "<|im_end|>"
+  pad_token: "<pad>"
+  bos_token: "<s>"
+  unk_token: "<unk>"
+tokens:
+  - "<|im_start|>"
+
+
+# fsdp:
+#   - full_shard
+#   - auto_wrap
+# fsdp_config:
+#   fsdp_limit_all_gathers: true
+#   fsdp_sync_module_states: true
+#   fsdp_offload_params: true
+#   fsdp_use_orig_params: false
+#   fsdp_cpu_ram_efficient_loading: true
+#   fsdp_transformer_layer_cls_to_wrap: MixtralSparseMoeBlock
+#   fsdp_state_dict_type: FULL_STATE_DICT
+#   fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+#   fsdp_sharding_strategy: FULL_SHARD
+#   fsdp_forward_prefetch: false
+#   fsdp_backward_prefetch: BACKWARD_PRE
+```
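
The README states that every line was filtered by a large model but does not describe the pipeline. As a rough sketch only, such a pass could look like the following; the judge model name, prompt wording, and file paths are placeholders, not the dataset author's actual code.

```python
# Rough sketch of an LLM-based quality/safety filter over ShareGPT-format JSONL.
# "judge-model", the prompt, and the file names are assumptions for illustration.
import json
from openai import OpenAI

client = OpenAI()  # any OpenAI-compatible endpoint serving a large judge model

JUDGE_PROMPT = (
    "You review roleplay training data. Reply with exactly one word: "
    "KEEP if the conversation is coherent, non-repetitive, and involves only adults; "
    "DROP otherwise."
)

def keep(sample: dict) -> bool:
    """Ask the judge model whether a ShareGPT-style conversation should be kept."""
    text = "\n".join(f'{t["from"]}: {t["value"]}' for t in sample["conversations"])
    resp = client.chat.completions.create(
        model="judge-model",  # placeholder name
        messages=[
            {"role": "system", "content": JUDGE_PROMPT},
            {"role": "user", "content": text[:20000]},  # truncate very long samples
        ],
        max_tokens=5,
        temperature=0,
    )
    return resp.choices[0].message.content.strip().upper().startswith("KEEP")

with open("raw.jsonl") as src, open("filtered.jsonl", "w") as dst:
    for line in src:
        sample = json.loads(line)
        if keep(sample):
            dst.write(json.dumps(sample) + "\n")
```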
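The config trains on ShareGPT-format conversations rendered with the ChatML template (`chat_template: chatml`, with `<|im_start|>` added as a new token and `<|im_end|>` used as the EOS token). The snippet below is a simplified illustration of what a rendered training sample looks like; it is not axolotl's actual prompt-building code.

```python
# Simplified illustration of mapping a ShareGPT sample onto ChatML, matching the
# chat_template and special tokens declared in the config above.
ROLE_MAP = {"system": "system", "human": "user", "gpt": "assistant"}

def to_chatml(sample: dict) -> str:
    """Render a ShareGPT conversation as a single ChatML training string."""
    parts = []
    for turn in sample["conversations"]:
        role = ROLE_MAP.get(turn["from"], turn["from"])
        parts.append(f"<|im_start|>{role}\n{turn['value']}<|im_end|>\n")
    return "".join(parts)

example = {
    "conversations": [
        {"from": "system", "value": "You are a roleplay partner."},
        {"from": "human", "value": "Hello there."},
        {"from": "gpt", "value": "Hello! Shall we begin?"},
    ]
}
print(to_chatml(example))
```

With `train_on_inputs: false`, only the assistant turns contribute to the loss. Training with this config would typically be launched with something like `accelerate launch -m axolotl.cli.train config.yml`.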
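Because the run adds `<|im_start|>` to the vocabulary and saves full copies of `embed_tokens` and `lm_head` alongside the LoRA weights (`lora_modules_to_save`), loading the finished adapter requires resizing the base model's embeddings first. A minimal sketch, assuming the adapter and its tokenizer were published to a hypothetical repo named after `output_dir` (the repo id is not confirmed):

```python
# Minimal sketch of loading the trained QLoRA adapter on top of the base model.
# The adapter repo id is a placeholder; substitute wherever the run was uploaded.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_id = "mistralai/Mistral-Nemo-Base-2407"
adapter_id = "openerotica/mixed-rp-mistral-nemo"  # hypothetical adapter location

tokenizer = AutoTokenizer.from_pretrained(adapter_id)  # carries the added <|im_start|> token
model = AutoModelForCausalLM.from_pretrained(
    base_id, torch_dtype=torch.bfloat16, device_map="auto"
)
model.resize_token_embeddings(len(tokenizer))  # match the saved embed_tokens/lm_head shapes
model = PeftModel.from_pretrained(model, adapter_id)
model.eval()
```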