lapp0 committed on
Commit
635c27b
·
verified ·
1 Parent(s): 6c6899a

End of training

Browse files
README.md CHANGED
@@ -77,7 +77,7 @@ LlamaForCausalLM(
77
 
78
  # Resource Usage
79
 
80
- - Max Train VRAM Use: 13.1269 GB
81
  - Available VRAM: 23.4329 GB
82
  - GPUs:
83
  - 1x NVIDIA GeForce RTX 4090
@@ -107,28 +107,6 @@ LlamaForCausalLM(
107
  (self_attn): LlamaSdpaAttention(
108
  (q_proj): Linear(in_features=576, out_features=576, bias=False)
109
  (k_proj): Linear(in_features=576, out_features=192, bias=False)
110
- @@ -10,17 +10,16 @@
111
- (o_proj): Linear(in_features=576, out_features=576, bias=False)
112
- (rotary_emb): LlamaRotaryEmbedding()
113
- )
114
- - (mlp): LlamaMLP(
115
- + (mlp): LigerSwiGLUMLP(
116
- (gate_proj): Linear(in_features=576, out_features=1536, bias=False)
117
- (up_proj): Linear(in_features=576, out_features=1536, bias=False)
118
- (down_proj): Linear(in_features=1536, out_features=576, bias=False)
119
- - (act_fn): SiLU()
120
- )
121
- - (input_layernorm): LlamaRMSNorm((576,), eps=1e-05)
122
- - (post_attention_layernorm): LlamaRMSNorm((576,), eps=1e-05)
123
- + (input_layernorm): LigerRMSNorm((576,), eps=1e-05, offset=0.0)
124
- + (post_attention_layernorm): LigerRMSNorm((576,), eps=1e-05, offset=0.0)
125
- )
126
- )
127
- - (norm): LlamaRMSNorm((576,), eps=1e-05)
128
- + (norm): LigerRMSNorm((576,), eps=1e-05, offset=0.0)
129
- (rotary_emb): LlamaRotaryEmbedding()
130
- )
131
- (lm_head): Linear(in_features=576, out_features=49152, bias=False)
132
 
133
  ```
134
 
@@ -136,7 +114,7 @@ LlamaForCausalLM(
136
  <br/>
137
 
138
  # Train Dataset
139
- Trained on 553,295,062 tokens from the [wikimedia/wikipedia](https://huggingface.co/datasets/wikimedia/wikipedia) dataset.
140
 
141
  - Num Samples: `998,000`
142
  - Subset: `20231101.en`
@@ -186,7 +164,7 @@ The following hyperparameters were used during training:
186
  weight=0
187
  )
188
  )`
189
- - lr_scheduler: `<torch.optim.lr_scheduler.LambdaLR object at 0x7520ce1738b0>`
190
  - student_model_name_or_path: `None`
191
  - student_config_name_or_path: `None`
192
  - student_model_config: `{'num_hidden_layers': 15}`
 
77
 
78
  # Resource Usage
79
 
80
+ - Max Train VRAM Use: 13.1273 GB
81
  - Available VRAM: 23.4329 GB
82
  - GPUs:
83
  - 1x NVIDIA GeForce RTX 4090
 
107
  (self_attn): LlamaSdpaAttention(
108
  (q_proj): Linear(in_features=576, out_features=576, bias=False)
109
  (k_proj): Linear(in_features=576, out_features=192, bias=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
  ```
112
 
 
114
  <br/>
115
 
116
  # Train Dataset
117
+ Trained on 553,266,374 tokens from the [wikimedia/wikipedia](https://huggingface.co/datasets/wikimedia/wikipedia) dataset.
118
 
119
  - Num Samples: `998,000`
120
  - Subset: `20231101.en`
 
164
  weight=0
165
  )
166
  )`
167
+ - lr_scheduler: `<torch.optim.lr_scheduler.LambdaLR object at 0x7520d65d6590>`
168
  - student_model_name_or_path: `None`
169
  - student_config_name_or_path: `None`
170
  - student_model_config: `{'num_hidden_layers': 15}`
logs/learning_rate=0.0001, lr_scheduler_kwargs=__power___1.0___lr_end___2e-05_, lr_scheduler_type=polynomial, per_device_train_batch_size=8, warmup_ratio=0.1/events.out.tfevents.1726757722.1c1a426a2fee ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02b27b354982aa85e8c2b5613f57f61a261cce711d9ff8c48638bb6ef88daf44
3
+ size 529