llmixer committed on
Commit
4c8cf31
·
verified ·
1 Parent(s): 1fb775a

Create README.md

Browse files
Files changed (1) hide show
  1. README.md +302 -0
README.md ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model:
3
+ - meta-llama/Meta-Llama-3-70B-Instruct
4
+ license: llama3
5
+ language:
6
+ - en
7
+ pipeline_tag: text-generation
8
+ tags:
9
+ - merge
10
+ - frankenmerge
11
+ - 96b
12
+ ---
13
+ # BigWeave v32 96b
14
+
15
+ <img src="https://cdn-uploads.huggingface.co/production/uploads/65a6db055c58475cf9e6def1/4CbbAN-X7ZWj702JrcCGH.png" alt="BigWeave v32 96b" width="600">
16
+
17
+ The BigWeave models aim to experimentally identify merge settings for increasing model performance. The version number merely tracks various attempts and is not a quality indicator. Only results demonstrating good performance are retained and shared.
18
+
19
+ # Prompting Format
20
+ Llama 3 Instruct prompt format (`llamav3`), the same format used by the base model.
21
+
22
+ # Merge process
23
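+ This is a self-merge of meta-llama/Meta-Llama-3-70B-Instruct. The middle layers (25–54) are each duplicated, growing the model from 80 to 110 layers. In every duplicated layer, `o_proj` and `down_proj` are scaled by 0.5, `q_proj` and `k_proj` by 0.7071 (≈ 1/√2), and `v_proj` and `up_proj` by 0.9, following the template by jukofyork shown here: https://github.com/arcee-ai/mergekit/issues/198#issuecomment-2079950009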
24
+
25
+ Merge configuration:
26
+ ```yaml
27
+ const_tag: &MODEL meta-llama/Meta-Llama-3-70B-Instruct
28
+
29
+ const_tag: &RESIDUAL_SCALE_FACTOR 0.5
30
+ const_tag: &QK_ATTENUATION_FACTOR 0.7071067812
31
+ const_tag: &OUT_FACTOR 0.9
32
+
33
+ scale-filter-env: &scale_filter_env
34
+   parameters:
35
+     scale:
36
+       - filter: o_proj
37
+         value: *RESIDUAL_SCALE_FACTOR
38
+       - filter: down_proj
39
+         value: *RESIDUAL_SCALE_FACTOR
40
+       - filter: q_proj
41
+         value: *QK_ATTENUATION_FACTOR
42
+       - filter: k_proj
43
+         value: *QK_ATTENUATION_FACTOR
44
+       - filter: v_proj
45
+         value: *OUT_FACTOR
46
+       - filter: up_proj
47
+         value: *OUT_FACTOR
48
+       - value: 1.0
49
+
50
+ slices:
51
+ - sources:
52
+ - model: *MODEL
53
+ layer_range: [0, 25]
54
+
55
+ - sources:
56
+ - model: *MODEL
57
+ layer_range: [25, 26]
58
+ <<: *scale_filter_env
59
+ - sources:
60
+ - model: *MODEL
61
+ layer_range: [25, 26]
62
+ <<: *scale_filter_env
63
+ - sources:
64
+ - model: *MODEL
65
+ layer_range: [26, 27]
66
+ <<: *scale_filter_env
67
+ - sources:
68
+ - model: *MODEL
69
+ layer_range: [26, 27]
70
+ <<: *scale_filter_env
71
+ - sources:
72
+ - model: *MODEL
73
+ layer_range: [27, 28]
74
+ <<: *scale_filter_env
75
+ - sources:
76
+ - model: *MODEL
77
+ layer_range: [27, 28]
78
+ <<: *scale_filter_env
79
+ - sources:
80
+ - model: *MODEL
81
+ layer_range: [28, 29]
82
+ <<: *scale_filter_env
83
+ - sources:
84
+ - model: *MODEL
85
+ layer_range: [28, 29]
86
+ <<: *scale_filter_env
87
+ - sources:
88
+ - model: *MODEL
89
+ layer_range: [29, 30]
90
+ <<: *scale_filter_env
91
+ - sources:
92
+ - model: *MODEL
93
+ layer_range: [29, 30]
94
+ <<: *scale_filter_env
95
+ - sources:
96
+ - model: *MODEL
97
+ layer_range: [30, 31]
98
+ <<: *scale_filter_env
99
+ - sources:
100
+ - model: *MODEL
101
+ layer_range: [30, 31]
102
+ <<: *scale_filter_env
103
+ - sources:
104
+ - model: *MODEL
105
+ layer_range: [31, 32]
106
+ <<: *scale_filter_env
107
+ - sources:
108
+ - model: *MODEL
109
+ layer_range: [31, 32]
110
+ <<: *scale_filter_env
111
+ - sources:
112
+ - model: *MODEL
113
+ layer_range: [32, 33]
114
+ <<: *scale_filter_env
115
+ - sources:
116
+ - model: *MODEL
117
+ layer_range: [32, 33]
118
+ <<: *scale_filter_env
119
+ - sources:
120
+ - model: *MODEL
121
+ layer_range: [33, 34]
122
+ <<: *scale_filter_env
123
+ - sources:
124
+ - model: *MODEL
125
+ layer_range: [33, 34]
126
+ <<: *scale_filter_env
127
+ - sources:
128
+ - model: *MODEL
129
+ layer_range: [34, 35]
130
+ <<: *scale_filter_env
131
+ - sources:
132
+ - model: *MODEL
133
+ layer_range: [34, 35]
134
+ <<: *scale_filter_env
135
+ - sources:
136
+ - model: *MODEL
137
+ layer_range: [35, 36]
138
+ <<: *scale_filter_env
139
+ - sources:
140
+ - model: *MODEL
141
+ layer_range: [35, 36]
142
+ <<: *scale_filter_env
143
+ - sources:
144
+ - model: *MODEL
145
+ layer_range: [36, 37]
146
+ <<: *scale_filter_env
147
+ - sources:
148
+ - model: *MODEL
149
+ layer_range: [36, 37]
150
+ <<: *scale_filter_env
151
+ - sources:
152
+ - model: *MODEL
153
+ layer_range: [37, 38]
154
+ <<: *scale_filter_env
155
+ - sources:
156
+ - model: *MODEL
157
+ layer_range: [37, 38]
158
+ <<: *scale_filter_env
159
+ - sources:
160
+ - model: *MODEL
161
+ layer_range: [38, 39]
162
+ <<: *scale_filter_env
163
+ - sources:
164
+ - model: *MODEL
165
+ layer_range: [38, 39]
166
+ <<: *scale_filter_env
167
+ - sources:
168
+ - model: *MODEL
169
+ layer_range: [39, 40]
170
+ <<: *scale_filter_env
171
+ - sources:
172
+ - model: *MODEL
173
+ layer_range: [39, 40]
174
+ <<: *scale_filter_env
175
+ - sources:
176
+ - model: *MODEL
177
+ layer_range: [40, 41]
178
+ <<: *scale_filter_env
179
+ - sources:
180
+ - model: *MODEL
181
+ layer_range: [40, 41]
182
+ <<: *scale_filter_env
183
+ - sources:
184
+ - model: *MODEL
185
+ layer_range: [41, 42]
186
+ <<: *scale_filter_env
187
+ - sources:
188
+ - model: *MODEL
189
+ layer_range: [41, 42]
190
+ <<: *scale_filter_env
191
+ - sources:
192
+ - model: *MODEL
193
+ layer_range: [42, 43]
194
+ <<: *scale_filter_env
195
+ - sources:
196
+ - model: *MODEL
197
+ layer_range: [42, 43]
198
+ <<: *scale_filter_env
199
+ - sources:
200
+ - model: *MODEL
201
+ layer_range: [43, 44]
202
+ <<: *scale_filter_env
203
+ - sources:
204
+ - model: *MODEL
205
+ layer_range: [43, 44]
206
+ <<: *scale_filter_env
207
+ - sources:
208
+ - model: *MODEL
209
+ layer_range: [44, 45]
210
+ <<: *scale_filter_env
211
+ - sources:
212
+ - model: *MODEL
213
+ layer_range: [44, 45]
214
+ <<: *scale_filter_env
215
+ - sources:
216
+ - model: *MODEL
217
+ layer_range: [45, 46]
218
+ <<: *scale_filter_env
219
+ - sources:
220
+ - model: *MODEL
221
+ layer_range: [45, 46]
222
+ <<: *scale_filter_env
223
+ - sources:
224
+ - model: *MODEL
225
+ layer_range: [46, 47]
226
+ <<: *scale_filter_env
227
+ - sources:
228
+ - model: *MODEL
229
+ layer_range: [46, 47]
230
+ <<: *scale_filter_env
231
+ - sources:
232
+ - model: *MODEL
233
+ layer_range: [47, 48]
234
+ <<: *scale_filter_env
235
+ - sources:
236
+ - model: *MODEL
237
+ layer_range: [47, 48]
238
+ <<: *scale_filter_env
239
+ - sources:
240
+ - model: *MODEL
241
+ layer_range: [48, 49]
242
+ <<: *scale_filter_env
243
+ - sources:
244
+ - model: *MODEL
245
+ layer_range: [48, 49]
246
+ <<: *scale_filter_env
247
+ - sources:
248
+ - model: *MODEL
249
+ layer_range: [49, 50]
250
+ <<: *scale_filter_env
251
+ - sources:
252
+ - model: *MODEL
253
+ layer_range: [49, 50]
254
+ <<: *scale_filter_env
255
+ - sources:
256
+ - model: *MODEL
257
+ layer_range: [50, 51]
258
+ <<: *scale_filter_env
259
+ - sources:
260
+ - model: *MODEL
261
+ layer_range: [50, 51]
262
+ <<: *scale_filter_env
263
+ - sources:
264
+ - model: *MODEL
265
+ layer_range: [51, 52]
266
+ <<: *scale_filter_env
267
+ - sources:
268
+ - model: *MODEL
269
+ layer_range: [51, 52]
270
+ <<: *scale_filter_env
271
+ - sources:
272
+ - model: *MODEL
273
+ layer_range: [52, 53]
274
+ <<: *scale_filter_env
275
+ - sources:
276
+ - model: *MODEL
277
+ layer_range: [52, 53]
278
+ <<: *scale_filter_env
279
+ - sources:
280
+ - model: *MODEL
281
+ layer_range: [53, 54]
282
+ <<: *scale_filter_env
283
+ - sources:
284
+ - model: *MODEL
285
+ layer_range: [53, 54]
286
+ <<: *scale_filter_env
287
+ - sources:
288
+ - model: *MODEL
289
+ layer_range: [54, 55]
290
+ <<: *scale_filter_env
291
+ - sources:
292
+ - model: *MODEL
293
+ layer_range: [54, 55]
294
+ <<: *scale_filter_env
295
+
296
+ - sources:
297
+ - model: *MODEL
298
+ layer_range: [55, 80]
299
+
300
+ merge_method: passthrough
301
+ dtype: float16
302
+ ```