LoneStriker committed on
Commit
18149aa
1 Parent(s): ad7038f

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: other
3
+ license_name: gemma-terms-of-use
4
+ license_link: https://ai.google.dev/gemma/terms
5
+ base_model: HuggingFaceH4/zephyr-7b-gemma-sft-v0.1
6
+ tags:
7
+ - alignment-handbook
8
+ - trl
9
+ - dpo
10
+ - generated_from_trainer
11
+ datasets:
12
+ - argilla/dpo-mix-7k
13
+ pipeline_tag: text-generation
14
+ model-index:
15
+ - name: zephyr-7b-gemma
16
+ results:
17
+ # MT-Bench (taken from model card)
18
+ - task:
19
+ type: text-generation
20
+ name: Text Generation
21
+ dataset:
22
+ name: MT-Bench
23
+ type: unknown
24
+ metrics:
25
+ - type: unknown
26
+ name: score
27
+ value: 7.81
28
+ source:
29
+ url: https://huggingface.co/spaces/lmsys/mt-bench
30
+ ---
31
+
32
+ <img src="https://huggingface.co/HuggingFaceH4/zephyr-7b-gemma-v0.1/resolve/main/thumbnail.png" alt="Zephyr 7B Gemma Logo" width="800" style="margin-left:auto; margin-right:auto; display:block"/>
33
+
34
+ # Model Card for Zephyr 7B Gemma
35
+
36
+ Zephyr is a series of language models that are trained to act as helpful assistants. Zephyr 7B Gemma is the third model in the series, and is a fine-tuned version of [`google/gemma-7b`](https://huggingface.co/google/gemma-7b) that was trained on a mix of publicly available, synthetic datasets using Direct Preference Optimization (DPO). You can reproduce the training of this model via the recipe provided in the [Alignment Handbook](https://github.com/huggingface/alignment-handbook).
37
+
38
+ ## Model description
39
+
40
+ - **Model type:** A 7B parameter GPT-like model fine-tuned on a mix of publicly available, synthetic datasets.
41
+ - **Language(s) (NLP):** Primarily English
42
+ - **License:** Gemma Terms of Use
43
+ - **Finetuned from model:** [google/gemma-7b](https://huggingface.co/google/gemma-7b)
44
+
45
+ ### Model Sources
46
+
47
+ <!-- Provide the basic links for the model. -->
48
+
49
+ - **Repository:** https://github.com/huggingface/alignment-handbook
50
+ - **Demo:** https://huggingface.co/spaces/HuggingFaceH4/zephyr-7b-gemma-chat
51
+
52
+ ## Performance
53
+
54
+ | Model |MT Bench⬇️|IFEval|
55
+ |-----------------------------------------------------------------------|------:|------:|
56
+ |[zephyr-7b-gemma-v0.1](https://huggingface.co/HuggingFaceH4/zephyr-7b-gemma-v0.1)| 7.81 | 28.76|
57
+ |[zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) | 7.34 | 43.81|
58
+ |[google/gemma-7b-it](https://huggingface.co/google/gemma-7b-it) | 6.38 | 38.01|
59
+
60
+
61
+
62
+ | Model |AGIEval|GPT4All|TruthfulQA|BigBench|Average ⬇️|
63
+ |-----------------------------------------------------------------------|------:|------:|---------:|-------:|------:|
64
+ |[zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) | 37.52| 71.77| 55.26| 39.77| 51.08|
65
+ |[zephyr-7b-gemma-v0.1](https://huggingface.co/HuggingFaceH4/zephyr-7b-gemma-v0.1)| 34.22| 66.37| 52.19| 37.10| 47.47|
66
+ |[mlabonne/Gemmalpaca-7B](https://huggingface.co/mlabonne/Gemmalpaca-7B)| 21.6 | 40.87| 44.85 | 30.49| 34.45|
67
+ |[google/gemma-7b-it](https://huggingface.co/google/gemma-7b-it) | 21.33| 40.84| 41.70| 30.25| 33.53|
68
+
69
+
70
+ <details><summary>Details of AGIEval, GPT4All, TruthfulQA, BigBench </summary>
71
+
72
+ ### AGIEval
73
+ | Task |Version| Metric |Value| |Stderr|
74
+ |------------------------------|------:|--------|----:|---|-----:|
75
+ |agieval_aqua_rat | 0|acc |21.65|± | 2.59|
76
+ | | |acc_norm|25.20|± | 2.73|
77
+ |agieval_logiqa_en | 0|acc |34.72|± | 1.87|
78
+ | | |acc_norm|35.94|± | 1.88|
79
+ |agieval_lsat_ar | 0|acc |19.57|± | 2.62|
80
+ | | |acc_norm|21.74|± | 2.73|
81
+ |agieval_lsat_lr | 0|acc |30.59|± | 2.04|
82
+ | | |acc_norm|32.55|± | 2.08|
83
+ |agieval_lsat_rc | 0|acc |49.07|± | 3.05|
84
+ | | |acc_norm|42.75|± | 3.02|
85
+ |agieval_sat_en | 0|acc |54.85|± | 3.48|
86
+ | | |acc_norm|53.40|± | 3.48|
87
+ |agieval_sat_en_without_passage| 0|acc |37.38|± | 3.38|
88
+ | | |acc_norm|33.98|± | 3.31|
89
+ |agieval_sat_math | 0|acc |30.91|± | 3.12|
90
+ | | |acc_norm|28.18|± | 3.04|
91
+
92
+ Average: 34.22%
93
+
94
+ ### GPT4All
95
+ | Task |Version| Metric |Value| |Stderr|
96
+ |-------------|------:|--------|----:|---|-----:|
97
+ |arc_challenge| 0|acc |49.15|± | 1.46|
98
+ | | |acc_norm|52.47|± | 1.46|
99
+ |arc_easy | 0|acc |77.44|± | 0.86|
100
+ | | |acc_norm|74.75|± | 0.89|
101
+ |boolq | 1|acc |79.69|± | 0.70|
102
+ |hellaswag | 0|acc |60.59|± | 0.49|
103
+ | | |acc_norm|78.00|± | 0.41|
104
+ |openbookqa | 0|acc |29.20|± | 2.04|
105
+ | | |acc_norm|37.80|± | 2.17|
106
+ |piqa | 0|acc |76.82|± | 0.98|
107
+ | | |acc_norm|77.80|± | 0.97|
108
+ |winogrande | 0|acc |64.09|± | 1.35|
109
+
110
+ Average: 66.37%
111
+
112
+ ### TruthfulQA
113
+ | Task |Version|Metric|Value| |Stderr|
114
+ |-------------|------:|------|----:|---|-----:|
115
+ |truthfulqa_mc| 1|mc1 |35.74|± | 1.68|
116
+ | | |mc2 |52.19|± | 1.59|
117
+
118
+ Average: 52.19%
119
+
120
+ ### Bigbench
121
+ | Task |Version| Metric |Value| |Stderr|
122
+ |------------------------------------------------|------:|---------------------|----:|---|-----:|
123
+ |bigbench_causal_judgement | 0|multiple_choice_grade|53.68|± | 3.63|
124
+ |bigbench_date_understanding | 0|multiple_choice_grade|59.89|± | 2.55|
125
+ |bigbench_disambiguation_qa | 0|multiple_choice_grade|30.23|± | 2.86|
126
+ |bigbench_geometric_shapes | 0|multiple_choice_grade|11.42|± | 1.68|
127
+ | | |exact_str_match | 0.00|± | 0.00|
128
+ |bigbench_logical_deduction_five_objects | 0|multiple_choice_grade|28.40|± | 2.02|
129
+ |bigbench_logical_deduction_seven_objects | 0|multiple_choice_grade|19.14|± | 1.49|
130
+ |bigbench_logical_deduction_three_objects | 0|multiple_choice_grade|44.67|± | 2.88|
131
+ |bigbench_movie_recommendation | 0|multiple_choice_grade|26.80|± | 1.98|
132
+ |bigbench_navigate | 0|multiple_choice_grade|50.00|± | 1.58|
133
+ |bigbench_reasoning_about_colored_objects | 0|multiple_choice_grade|52.75|± | 1.12|
134
+ |bigbench_ruin_names | 0|multiple_choice_grade|33.04|± | 2.22|
135
+ |bigbench_salient_translation_error_detection | 0|multiple_choice_grade|33.37|± | 1.49|
136
+ |bigbench_snarks | 0|multiple_choice_grade|48.62|± | 3.73|
137
+ |bigbench_sports_understanding | 0|multiple_choice_grade|58.11|± | 1.57|
138
+ |bigbench_temporal_sequences | 0|multiple_choice_grade|37.20|± | 1.53|
139
+ |bigbench_tracking_shuffled_objects_five_objects | 0|multiple_choice_grade|20.08|± | 1.13|
140
+ |bigbench_tracking_shuffled_objects_seven_objects| 0|multiple_choice_grade|15.77|± | 0.87|
141
+ |bigbench_tracking_shuffled_objects_three_objects| 0|multiple_choice_grade|44.67|± | 2.88|
142
+
143
+ Average: 37.10%
144
+
145
+ </details>
146
+
147
+
148
+ ## Intended uses & limitations
149
+
150
+ The model was initially fine-tuned on the [DEITA 10K](https://huggingface.co/datasets/HuggingFaceH4/deita-10k-v0-sft) dataset, which contains a diverse range of synthetic dialogues generated by ChatGPT.
151
+ We then further aligned the model with [🤗 TRL's](https://github.com/huggingface/trl) `DPOTrainer` on the [argilla/dpo-mix-7k](https://huggingface.co/datasets/argilla/dpo-mix-7k) dataset, which contains 7k prompts and model completions that are ranked by GPT-4. As a result, the model can be used for chat and you can check out our [demo](https://huggingface.co/spaces/HuggingFaceH4/zephyr-chat) to test its capabilities.
152
+
153
+ Here's how you can run the model using the `pipeline()` function from 🤗 Transformers:
154
+
155
+ ```python
156
+ # pip install "transformers>=4.38.2"
157
+ # pip install accelerate
158
+
159
+ import torch
160
+ from transformers import pipeline
161
+
162
+ pipe = pipeline(
163
+ "text-generation",
164
+ model="HuggingFaceH4/zephyr-7b-gemma-v0.1",
165
+ device_map="auto",
166
+ torch_dtype=torch.bfloat16,
167
+ )
168
+ messages = [
169
+ {
170
+ "role": "system",
171
+ "content": "", # Model not yet trained to follow this
172
+ },
173
+ {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
174
+ ]
175
+ outputs = pipe(
176
+ messages,
177
+ max_new_tokens=128,
178
+ do_sample=True,
179
+ temperature=0.7,
180
+ top_k=50,
181
+ top_p=0.95,
182
+ stop_sequence="<|im_end|>",
183
+ )
184
+ print(outputs[0]["generated_text"][-1]["content"])
185
+ # It is not possible for a human to eat a helicopter in one sitting, as a
186
+ # helicopter is a large and inedible machine. Helicopters are made of metal,
187
+ # plastic, and other materials that are not meant to be consumed by humans.
188
+ # Eating a helicopter would be extremely dangerous and would likely cause
189
+ # serious health problems, including choking, suffocation, and poisoning. It is
190
+ # important to only eat food that is safe and intended for human consumption.
191
+ ```
192
+
193
+ ## Bias, Risks, and Limitations
194
+
195
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
196
+
197
+ Zephyr 7B Gemma has not been aligned to human preferences for safety within the RLHF phase or deployed with in-the-loop filtering of responses like ChatGPT, so the model can produce problematic outputs (especially when prompted to do so). The size and composition of the corpus used to train the base model (`google/gemma-7b`) are also unknown; however, it is likely to have included a mix of Web data and technical sources like books and code. See the [StarCoder2 model card](https://huggingface.co/bigcode/starcoder2-15b) for an example of this.
198
+
199
+
200
+ ## Training and evaluation data
201
+
202
+
203
+ This model is a fine-tuned version of [HuggingFaceH4/zephyr-7b-gemma-sft-v0.1](https://huggingface.co/HuggingFaceH4/zephyr-7b-gemma-sft-v0.1) on the argilla/dpo-mix-7k dataset.
204
+
205
+ It achieves the following results on the evaluation set:
206
+ - Loss: 0.4695
207
+ - Rewards/chosen: -3.3746
208
+ - Rewards/rejected: -4.9715
209
+ - Rewards/accuracies: 0.7188
210
+ - Rewards/margins: 1.5970
211
+ - Logps/rejected: -459.4853
212
+ - Logps/chosen: -429.9115
213
+ - Logits/rejected: 86.4684
214
+ - Logits/chosen: 92.8200
215
+
216
+ ### Training hyperparameters
217
+
218
+ The following hyperparameters were used during training:
219
+ - learning_rate: 5e-07
220
+ - train_batch_size: 2
221
+ - eval_batch_size: 4
222
+ - seed: 42
223
+ - distributed_type: multi-GPU
224
+ - num_devices: 8
225
+ - gradient_accumulation_steps: 8
226
+ - total_train_batch_size: 128
227
+ - total_eval_batch_size: 32
228
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
229
+ - lr_scheduler_type: cosine
230
+ - lr_scheduler_warmup_ratio: 0.1
231
+ - num_epochs: 2
232
+
233
+ ### Training results
234
+
235
+ | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
236
+ |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
237
+ | 0.1923 | 1.9 | 100 | 0.4736 | -3.4575 | -4.9556 | 0.75 | 1.4980 | -459.1662 | -431.5707 | 86.3863 | 92.7360 |
238
+
239
+
240
+ ### Framework versions
241
+
242
+ - Transformers 4.39.0.dev0
243
+ - Pytorch 2.1.2+cu121
244
+ - Datasets 2.14.6
245
+ - Tokenizers 0.15.1
246
+
247
+ ## Citation Information
248
+
249
+ If you find this model useful in your work, please consider citing the Zephyr technical report:
250
+
251
+ ```
252
+ @misc{tunstall2023zephyr,
253
+ title={Zephyr: Direct Distillation of LM Alignment},
254
+ author={Lewis Tunstall and Edward Beeching and Nathan Lambert and Nazneen Rajani and Kashif Rasul and Younes Belkada and Shengyi Huang and Leandro von Werra and Clémentine Fourrier and Nathan Habib and Nathan Sarrazin and Omar Sanseviero and Alexander M. Rush and Thomas Wolf},
255
+ year={2023},
256
+ eprint={2310.16944},
257
+ archivePrefix={arXiv},
258
+ primaryClass={cs.LG}
259
+ }
260
+ ```
261
+
262
+ You may also wish to cite the creators of this model:
263
+
264
+ ```
265
+ @misc{zephyr_7b_gemma,
266
+ author = {Lewis Tunstall and Philipp Schmid},
267
+ title = {Zephyr 7B Gemma},
268
+ year = {2024},
269
+ publisher = {Hugging Face},
270
+ journal = {Hugging Face repository},
271
+ howpublished = {\url{https://huggingface.co/HuggingFaceH4/zephyr-7b-gemma-v0.1}}
272
+ }
273
+ ```
all_results.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.97,
3
+ "eval_logits/chosen": 92.81997680664062,
4
+ "eval_logits/rejected": 86.46841430664062,
5
+ "eval_logps/chosen": -429.9114685058594,
6
+ "eval_logps/rejected": -459.4852600097656,
7
+ "eval_loss": 0.4695254862308502,
8
+ "eval_rewards/accuracies": 0.71875,
9
+ "eval_rewards/chosen": -3.3745555877685547,
10
+ "eval_rewards/margins": 1.5969535112380981,
11
+ "eval_rewards/rejected": -4.9715094566345215,
12
+ "eval_runtime": 52.4051,
13
+ "eval_samples": 750,
14
+ "eval_samples_per_second": 14.312,
15
+ "eval_steps_per_second": 0.458,
16
+ "train_loss": 0.38887147261546207,
17
+ "train_runtime": 1183.8142,
18
+ "train_samples": 6750,
19
+ "train_samples_per_second": 11.404,
20
+ "train_steps_per_second": 0.088
21
+ }
config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "lewtun/zephyr-7b-gemma-sft",
3
+ "architectures": [
4
+ "GemmaForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 2,
9
+ "eos_token_id": 1,
10
+ "head_dim": 256,
11
+ "hidden_act": "gelu",
12
+ "hidden_size": 3072,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 24576,
15
+ "max_position_embeddings": 8192,
16
+ "model_type": "gemma",
17
+ "num_attention_heads": 16,
18
+ "num_hidden_layers": 28,
19
+ "num_key_value_heads": 16,
20
+ "pad_token_id": 0,
21
+ "rms_norm_eps": 1e-06,
22
+ "rope_scaling": null,
23
+ "rope_theta": 10000.0,
24
+ "torch_dtype": "bfloat16",
25
+ "transformers_version": "4.39.0.dev0",
26
+ "use_cache": true,
27
+ "vocab_size": 256000
28
+ }
eval_results.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.97,
3
+ "eval_logits/chosen": 92.81997680664062,
4
+ "eval_logits/rejected": 86.46841430664062,
5
+ "eval_logps/chosen": -429.9114685058594,
6
+ "eval_logps/rejected": -459.4852600097656,
7
+ "eval_loss": 0.4695254862308502,
8
+ "eval_rewards/accuracies": 0.71875,
9
+ "eval_rewards/chosen": -3.3745555877685547,
10
+ "eval_rewards/margins": 1.5969535112380981,
11
+ "eval_rewards/rejected": -4.9715094566345215,
12
+ "eval_runtime": 52.4051,
13
+ "eval_samples": 750,
14
+ "eval_samples_per_second": 14.312,
15
+ "eval_steps_per_second": 0.458
16
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 2,
4
+ "eos_token_id": 1,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.39.0.dev0"
7
+ }
model.safetensors.index.json ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 17075361792
4
+ },
5
+ "weight_map": {
6
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
7
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
8
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
9
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
10
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
11
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
12
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
13
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
14
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
15
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
16
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
17
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
18
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
19
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
20
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
21
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
22
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
23
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
24
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
25
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
26
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
27
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
28
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
29
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
30
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
31
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
32
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
33
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
34
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
35
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
36
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
37
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
38
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
39
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
40
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
41
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
42
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
43
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
44
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
45
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
46
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
47
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
48
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
49
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
50
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
51
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
52
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
53
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
54
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
55
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
56
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
57
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
58
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
59
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
60
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
61
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
62
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
63
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
64
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
65
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
66
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
67
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
68
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
69
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
70
+ "model.layers.15.input_layernorm.weight": "model-00003-of-00004.safetensors",
71
+ "model.layers.15.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
72
+ "model.layers.15.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
73
+ "model.layers.15.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
74
+ "model.layers.15.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
75
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
76
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
77
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
78
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
79
+ "model.layers.16.input_layernorm.weight": "model-00003-of-00004.safetensors",
80
+ "model.layers.16.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
81
+ "model.layers.16.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
82
+ "model.layers.16.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
83
+ "model.layers.16.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
84
+ "model.layers.16.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
85
+ "model.layers.16.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
86
+ "model.layers.16.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
87
+ "model.layers.16.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
88
+ "model.layers.17.input_layernorm.weight": "model-00003-of-00004.safetensors",
89
+ "model.layers.17.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
90
+ "model.layers.17.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
91
+ "model.layers.17.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
92
+ "model.layers.17.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
93
+ "model.layers.17.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
94
+ "model.layers.17.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
95
+ "model.layers.17.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
96
+ "model.layers.17.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
97
+ "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
98
+ "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
99
+ "model.layers.18.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
100
+ "model.layers.18.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
101
+ "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
102
+ "model.layers.18.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
103
+ "model.layers.18.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
104
+ "model.layers.18.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
105
+ "model.layers.18.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
106
+ "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
107
+ "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
108
+ "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
109
+ "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
110
+ "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
111
+ "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
112
+ "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
113
+ "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
114
+ "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
115
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
116
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
117
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
118
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
119
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
120
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
121
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
122
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
123
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
124
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
125
+ "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
126
+ "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
127
+ "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
128
+ "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
129
+ "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
130
+ "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
131
+ "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
132
+ "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
133
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
134
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
135
+ "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
136
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
137
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
138
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
139
+ "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
140
+ "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
141
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
142
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
143
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
144
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
145
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
146
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
147
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
148
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
149
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
150
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
151
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
152
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
153
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
154
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
155
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
156
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
157
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
158
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
159
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
160
+ "model.layers.24.input_layernorm.weight": "model-00004-of-00004.safetensors",
161
+ "model.layers.24.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
162
+ "model.layers.24.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
163
+ "model.layers.24.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
164
+ "model.layers.24.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
165
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
166
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
167
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
168
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
169
+ "model.layers.25.input_layernorm.weight": "model-00004-of-00004.safetensors",
170
+ "model.layers.25.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
171
+ "model.layers.25.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
172
+ "model.layers.25.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
173
+ "model.layers.25.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
174
+ "model.layers.25.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
175
+ "model.layers.25.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
176
+ "model.layers.25.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
177
+ "model.layers.25.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
178
+ "model.layers.26.input_layernorm.weight": "model-00004-of-00004.safetensors",
179
+ "model.layers.26.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
180
+ "model.layers.26.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
181
+ "model.layers.26.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
182
+ "model.layers.26.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
183
+ "model.layers.26.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
184
+ "model.layers.26.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
185
+ "model.layers.26.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
186
+ "model.layers.26.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
187
+ "model.layers.27.input_layernorm.weight": "model-00004-of-00004.safetensors",
188
+ "model.layers.27.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
189
+ "model.layers.27.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
190
+ "model.layers.27.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
191
+ "model.layers.27.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
192
+ "model.layers.27.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
193
+ "model.layers.27.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
194
+ "model.layers.27.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
195
+ "model.layers.27.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
196
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
197
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
198
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
199
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
200
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
201
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
202
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
203
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
204
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
205
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
206
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
207
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
208
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
209
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
210
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
211
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
212
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
213
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
214
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
215
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
216
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
217
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
218
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
219
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
220
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
221
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
222
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
223
+ "model.layers.6.input_layernorm.weight": "model-00002-of-00004.safetensors",
224
+ "model.layers.6.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
225
+ "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
226
+ "model.layers.6.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
227
+ "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
228
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
229
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
230
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
231
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
232
+ "model.layers.7.input_layernorm.weight": "model-00002-of-00004.safetensors",
233
+ "model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
234
+ "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
235
+ "model.layers.7.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
236
+ "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
237
+ "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
238
+ "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
239
+ "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
240
+ "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
241
+ "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
242
+ "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
243
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
244
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
245
+ "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
246
+ "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
247
+ "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
248
+ "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
249
+ "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
250
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
251
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
252
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
253
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
254
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
255
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
256
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
257
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
258
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
259
+ "model.norm.weight": "model-00004-of-00004.safetensors"
260
+ }
261
+ }
output-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf586e941391b309b9d9cca7ef093ee0a4ce51e506b0ad85f282ed34c4a2fb26
3
+ size 8543943248
output-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25d274309aff742e170afce4876aaa4f42d39e80b235fce0b1ffd67fb6f48c59
3
+ size 1563370576
special_tokens_map.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>"
5
+ ],
6
+ "bos_token": {
7
+ "content": "<bos>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false
12
+ },
13
+ "eos_token": {
14
+ "content": "<eos>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false
19
+ },
20
+ "pad_token": {
21
+ "content": "<pad>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false
26
+ },
27
+ "unk_token": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false
33
+ }
34
+ }
thumbnail.png ADDED
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22449cb9ef4bad0db7dd93b46ddff7ab7d6a654dd4f903e130ddb6361eac3af5
3
+ size 17477473
tokenizer_config.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<pad>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<eos>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "<bos>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "3": {
30
+ "content": "<unk>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "106": {
38
+ "content": "<|im_start|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "107": {
46
+ "content": "<|im_end|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ }
53
+ },
54
+ "additional_special_tokens": [
55
+ "<|im_start|>",
56
+ "<|im_end|>"
57
+ ],
58
+ "bos_token": "<bos>",
59
+ "chat_template": "{% if messages[0]['role'] == 'user' or messages[0]['role'] == 'system' %}{{ bos_token }}{% endif %}{% for message in messages %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% elif messages[-1]['role'] == 'assistant' %}{{ eos_token }}{% endif %}",
60
+ "clean_up_tokenization_spaces": false,
61
+ "eos_token": "<eos>",
62
+ "legacy": null,
63
+ "model_max_length": 2048,
64
+ "pad_token": "<pad>",
65
+ "sp_model_kwargs": {},
66
+ "spaces_between_special_tokens": false,
67
+ "tokenizer_class": "GemmaTokenizer",
68
+ "unk_token": "<unk>",
69
+ "use_default_system_prompt": false
70
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.97,
3
+ "train_loss": 0.38887147261546207,
4
+ "train_runtime": 1183.8142,
5
+ "train_samples": 6750,
6
+ "train_samples_per_second": 11.404,
7
+ "train_steps_per_second": 0.088
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.971563981042654,
5
+ "eval_steps": 100,
6
+ "global_step": 104,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.02,
13
+ "grad_norm": 139.638709617328,
14
+ "learning_rate": 4.545454545454545e-08,
15
+ "logits/chosen": 111.16130065917969,
16
+ "logits/rejected": 86.8372802734375,
17
+ "logps/chosen": -326.8536071777344,
18
+ "logps/rejected": -329.15960693359375,
19
+ "loss": 0.6931,
20
+ "rewards/accuracies": 0.0,
21
+ "rewards/chosen": 0.0,
22
+ "rewards/margins": 0.0,
23
+ "rewards/rejected": 0.0,
24
+ "step": 1
25
+ },
26
+ {
27
+ "epoch": 0.19,
28
+ "grad_norm": 141.5345140695996,
29
+ "learning_rate": 4.545454545454545e-07,
30
+ "logits/chosen": 110.37065124511719,
31
+ "logits/rejected": 133.2639923095703,
32
+ "logps/chosen": -350.3541259765625,
33
+ "logps/rejected": -434.3558349609375,
34
+ "loss": 0.7191,
35
+ "rewards/accuracies": 0.4722222089767456,
36
+ "rewards/chosen": 0.13274627923965454,
37
+ "rewards/margins": 0.07573667168617249,
38
+ "rewards/rejected": 0.05700961872935295,
39
+ "step": 10
40
+ },
41
+ {
42
+ "epoch": 0.38,
43
+ "grad_norm": 123.71909837085582,
44
+ "learning_rate": 4.885348141000122e-07,
45
+ "logits/chosen": 117.74342346191406,
46
+ "logits/rejected": 128.52548217773438,
47
+ "logps/chosen": -333.21240234375,
48
+ "logps/rejected": -410.2923889160156,
49
+ "loss": 0.6097,
50
+ "rewards/accuracies": 0.7124999761581421,
51
+ "rewards/chosen": 0.11470325291156769,
52
+ "rewards/margins": 0.7479402422904968,
53
+ "rewards/rejected": -0.6332370042800903,
54
+ "step": 20
55
+ },
56
+ {
57
+ "epoch": 0.57,
58
+ "grad_norm": 111.89651526533274,
59
+ "learning_rate": 4.5025027361734613e-07,
60
+ "logits/chosen": 114.44095611572266,
61
+ "logits/rejected": 119.11683654785156,
62
+ "logps/chosen": -399.1412048339844,
63
+ "logps/rejected": -474.2645568847656,
64
+ "loss": 0.596,
65
+ "rewards/accuracies": 0.7250000238418579,
66
+ "rewards/chosen": -1.7276217937469482,
67
+ "rewards/margins": 1.0803521871566772,
68
+ "rewards/rejected": -2.807974100112915,
69
+ "step": 30
70
+ },
71
+ {
72
+ "epoch": 0.76,
73
+ "grad_norm": 102.67088507130228,
74
+ "learning_rate": 3.893311157806091e-07,
75
+ "logits/chosen": 116.33101654052734,
76
+ "logits/rejected": 111.0595703125,
77
+ "logps/chosen": -428.7275390625,
78
+ "logps/rejected": -464.0934143066406,
79
+ "loss": 0.5343,
80
+ "rewards/accuracies": 0.7250000238418579,
81
+ "rewards/chosen": -2.2770252227783203,
82
+ "rewards/margins": 0.9522085189819336,
83
+ "rewards/rejected": -3.229233503341675,
84
+ "step": 40
85
+ },
86
+ {
87
+ "epoch": 0.95,
88
+ "grad_norm": 130.9996197198566,
89
+ "learning_rate": 3.126631330646801e-07,
90
+ "logits/chosen": 123.2393569946289,
91
+ "logits/rejected": 124.50789642333984,
92
+ "logps/chosen": -438.548095703125,
93
+ "logps/rejected": -474.1234436035156,
94
+ "loss": 0.5138,
95
+ "rewards/accuracies": 0.762499988079071,
96
+ "rewards/chosen": -2.3258581161499023,
97
+ "rewards/margins": 1.3220884799957275,
98
+ "rewards/rejected": -3.647946834564209,
99
+ "step": 50
100
+ },
101
+ {
102
+ "epoch": 1.14,
103
+ "grad_norm": 56.950942870641875,
104
+ "learning_rate": 2.2891223348923882e-07,
105
+ "logits/chosen": 122.619384765625,
106
+ "logits/rejected": 126.1447525024414,
107
+ "logps/chosen": -414.3634338378906,
108
+ "logps/rejected": -468.19586181640625,
109
+ "loss": 0.2724,
110
+ "rewards/accuracies": 0.893750011920929,
111
+ "rewards/chosen": -2.3773388862609863,
112
+ "rewards/margins": 2.358515501022339,
113
+ "rewards/rejected": -4.735854148864746,
114
+ "step": 60
115
+ },
116
+ {
117
+ "epoch": 1.33,
118
+ "grad_norm": 52.820355390804025,
119
+ "learning_rate": 1.4754491880085317e-07,
120
+ "logits/chosen": 117.16709899902344,
121
+ "logits/rejected": 118.9737319946289,
122
+ "logps/chosen": -387.70526123046875,
123
+ "logps/rejected": -511.97503662109375,
124
+ "loss": 0.1936,
125
+ "rewards/accuracies": 0.9437500238418579,
126
+ "rewards/chosen": -2.4186935424804688,
127
+ "rewards/margins": 2.5914835929870605,
128
+ "rewards/rejected": -5.010177135467529,
129
+ "step": 70
130
+ },
131
+ {
132
+ "epoch": 1.52,
133
+ "grad_norm": 51.657826972971314,
134
+ "learning_rate": 7.775827023107834e-08,
135
+ "logits/chosen": 124.15473937988281,
136
+ "logits/rejected": 125.7086181640625,
137
+ "logps/chosen": -446.75421142578125,
138
+ "logps/rejected": -543.6109619140625,
139
+ "loss": 0.1779,
140
+ "rewards/accuracies": 0.981249988079071,
141
+ "rewards/chosen": -2.316882848739624,
142
+ "rewards/margins": 2.962496757507324,
143
+ "rewards/rejected": -5.279379844665527,
144
+ "step": 80
145
+ },
146
+ {
147
+ "epoch": 1.71,
148
+ "grad_norm": 86.34373603352554,
149
+ "learning_rate": 2.7440387297912122e-08,
150
+ "logits/chosen": 107.07579040527344,
151
+ "logits/rejected": 111.74522399902344,
152
+ "logps/chosen": -425.4237365722656,
153
+ "logps/rejected": -509.67718505859375,
154
+ "loss": 0.1765,
155
+ "rewards/accuracies": 0.9437500238418579,
156
+ "rewards/chosen": -2.749206066131592,
157
+ "rewards/margins": 3.0597147941589355,
158
+ "rewards/rejected": -5.8089213371276855,
159
+ "step": 90
160
+ },
161
+ {
162
+ "epoch": 1.9,
163
+ "grad_norm": 51.66215546933828,
164
+ "learning_rate": 2.27878296044029e-09,
165
+ "logits/chosen": 123.38490295410156,
166
+ "logits/rejected": 113.675537109375,
167
+ "logps/chosen": -439.7268981933594,
168
+ "logps/rejected": -550.8162841796875,
169
+ "loss": 0.1923,
170
+ "rewards/accuracies": 0.9624999761581421,
171
+ "rewards/chosen": -2.560769557952881,
172
+ "rewards/margins": 3.2135703563690186,
173
+ "rewards/rejected": -5.77433967590332,
174
+ "step": 100
175
+ },
176
+ {
177
+ "epoch": 1.9,
178
+ "eval_logits/chosen": 92.73604583740234,
179
+ "eval_logits/rejected": 86.38631439208984,
180
+ "eval_logps/chosen": -431.5707092285156,
181
+ "eval_logps/rejected": -459.1661682128906,
182
+ "eval_loss": 0.4735770523548126,
183
+ "eval_rewards/accuracies": 0.75,
184
+ "eval_rewards/chosen": -3.4575202465057373,
185
+ "eval_rewards/margins": 1.4980329275131226,
186
+ "eval_rewards/rejected": -4.9555535316467285,
187
+ "eval_runtime": 50.3064,
188
+ "eval_samples_per_second": 14.909,
189
+ "eval_steps_per_second": 0.477,
190
+ "step": 100
191
+ },
192
+ {
193
+ "epoch": 1.97,
194
+ "step": 104,
195
+ "total_flos": 0.0,
196
+ "train_loss": 0.38887147261546207,
197
+ "train_runtime": 1183.8142,
198
+ "train_samples_per_second": 11.404,
199
+ "train_steps_per_second": 0.088
200
+ }
201
+ ],
202
+ "logging_steps": 10,
203
+ "max_steps": 104,
204
+ "num_input_tokens_seen": 0,
205
+ "num_train_epochs": 2,
206
+ "save_steps": 500,
207
+ "total_flos": 0.0,
208
+ "train_batch_size": 2,
209
+ "trial_name": null,
210
+ "trial_params": null
211
+ }