tanglumy
commited on
Commit
·
1a6c70c
1
Parent(s):
b0723db
initial commit of model weights
Browse files- gpu_training/best_model.pt +3 -0
- gpu_training/checkpoint_0.pt +3 -0
- gpu_training/checkpoint_1000.pt +3 -0
- gpu_training/checkpoint_2000.pt +3 -0
- gpu_training/checkpoint_3000.pt +3 -0
- gpu_training/checkpoint_4000.pt +3 -0
- gpu_training/eval/.ipynb_checkpoints/evaluation_results-checkpoint.json +259 -0
- gpu_training/eval/.ipynb_checkpoints/summary-checkpoint.txt +18 -0
- gpu_training/eval/evaluation_results.json +70 -0
- gpu_training/eval/summary.txt +18 -0
- gpu_training/final_model.pt +3 -0
- gpu_training/steering/.ipynb_checkpoints/metadata-checkpoint.json +138 -0
- gpu_training/steering/metadata.json +138 -0
- gpu_training/steering/steering_vectors.pt +3 -0
- gpu_training_20250713_015232/steering/metadata.json +138 -0
- gpu_training_20250713_015232/steering/steering_vectors.pt +3 -0
- gpu_training_20250713_032436/steering/metadata.json +138 -0
- gpu_training_20250713_032436/steering/steering_vectors.pt +3 -0
- gpu_training_20250713_032744/steering/metadata.json +138 -0
- gpu_training_20250713_032744/steering/steering_vectors.pt +3 -0
- gpu_training_20250713_033240/best_model.pt +3 -0
- gpu_training_20250713_033240/checkpoint_0.pt +3 -0
- gpu_training_20250713_033240/checkpoint_1000.pt +3 -0
- gpu_training_20250713_033240/checkpoint_1500.pt +3 -0
- gpu_training_20250713_033240/checkpoint_2000.pt +3 -0
- gpu_training_20250713_033240/checkpoint_2500.pt +3 -0
- gpu_training_20250713_033240/checkpoint_3000.pt +3 -0
- gpu_training_20250713_033240/checkpoint_3500.pt +3 -0
- gpu_training_20250713_033240/checkpoint_4000.pt +3 -0
- gpu_training_20250713_033240/checkpoint_4500.pt +3 -0
- gpu_training_20250713_033240/checkpoint_500.pt +3 -0
- gpu_training_20250713_033240/final_model.pt +3 -0
- gpu_training_20250713_033240/steering/metadata.json +138 -0
- gpu_training_20250713_033240/steering/steering_vectors.pt +3 -0
- test_training/config.yaml +20 -0
gpu_training/best_model.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:688a56d0a26a85cd0803b7e2cbb05acc1b10e2127e1f28f4e8b3a8f48939121d
|
3 |
+
size 78711093
|
gpu_training/checkpoint_0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:af443be42ff0540419e818d2483aea756d099dbc224f0fee13d62efe1e63b48a
|
3 |
+
size 78711137
|
gpu_training/checkpoint_1000.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2b928ecb85ef3e6bab044a2b8dcf6b7b95db79bf2840647edee71d9589488509
|
3 |
+
size 78711203
|
gpu_training/checkpoint_2000.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:848923538b815fe33df8aa7784aeab07aec7ab8f38664af122fe8e8729abeec9
|
3 |
+
size 78711203
|
gpu_training/checkpoint_3000.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b84249e6037c914ec54f680914b568d42988f91ca4ee4b38d8b97e26a6337828
|
3 |
+
size 78711203
|
gpu_training/checkpoint_4000.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1c1b5fbcdbffe42f10f37889117d261fd0757308ba4bc7ce48f3966fd42b4740
|
3 |
+
size 78711203
|
gpu_training/eval/.ipynb_checkpoints/evaluation_results-checkpoint.json
ADDED
@@ -0,0 +1,259 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"gsm8k": {
|
3 |
+
"fr_ponder": {
|
4 |
+
"accuracy": 0.0,
|
5 |
+
"efficiency": {
|
6 |
+
"mean_flops": 1548841728.0,
|
7 |
+
"median_flops": 1278950400.0,
|
8 |
+
"std_flops": 2059376643.3027253,
|
9 |
+
"min_flops": 26234880.0,
|
10 |
+
"max_flops": 11031767040.0,
|
11 |
+
"total_flops": 309768345600.0
|
12 |
+
},
|
13 |
+
"speedup": {
|
14 |
+
"mean_time": 3.2877051854133605,
|
15 |
+
"median_time": 3.692348003387451,
|
16 |
+
"std_time": 3.2731626526056754,
|
17 |
+
"min_time": 0.08106875419616699,
|
18 |
+
"max_time": 11.631380319595337,
|
19 |
+
"total_time": 657.5410370826721
|
20 |
+
},
|
21 |
+
"avg_flops": 1548841728.0,
|
22 |
+
"avg_time": 3.2877051854133605,
|
23 |
+
"avg_steps": 118.075
|
24 |
+
},
|
25 |
+
"baseline": {
|
26 |
+
"accuracy": 0.24,
|
27 |
+
"efficiency": {
|
28 |
+
"mean_flops": 1264988628340.475,
|
29 |
+
"median_flops": 1300493380979.0,
|
30 |
+
"std_flops": 56453805454.47149,
|
31 |
+
"min_flops": 1050444988777.0,
|
32 |
+
"max_flops": 1300493380979.0,
|
33 |
+
"total_flops": 252997725668095.0
|
34 |
+
},
|
35 |
+
"speedup": {
|
36 |
+
"mean_time": 5.5069497311115265,
|
37 |
+
"median_time": 5.29523766040802,
|
38 |
+
"std_time": 2.5041656924556133,
|
39 |
+
"min_time": 0.25672364234924316,
|
40 |
+
"max_time": 8.91803526878357,
|
41 |
+
"total_time": 1101.3899462223053
|
42 |
+
},
|
43 |
+
"avg_flops": 1264988628340.475,
|
44 |
+
"avg_time": 5.5069497311115265
|
45 |
+
},
|
46 |
+
"alpha_sweep": {
|
47 |
+
"0.1": {
|
48 |
+
"accuracy": 0.26,
|
49 |
+
"efficiency": {
|
50 |
+
"mean_flops": 316247157084.55,
|
51 |
+
"median_flops": 325123345244.0,
|
52 |
+
"std_flops": 14113451363.45452,
|
53 |
+
"min_flops": 262611247194.0,
|
54 |
+
"max_flops": 325123345244.0,
|
55 |
+
"total_flops": 63249431416910.0
|
56 |
+
},
|
57 |
+
"speedup": {
|
58 |
+
"mean_time": 5.520822087526321,
|
59 |
+
"median_time": 5.165250658988953,
|
60 |
+
"std_time": 2.544426104881166,
|
61 |
+
"min_time": 0.5056312084197998,
|
62 |
+
"max_time": 8.917860269546509,
|
63 |
+
"total_time": 1104.1644175052643
|
64 |
+
},
|
65 |
+
"avg_flops": 316247157084.55,
|
66 |
+
"avg_time": 5.520822087526321
|
67 |
+
},
|
68 |
+
"0.2": {
|
69 |
+
"accuracy": 0.31,
|
70 |
+
"efficiency": {
|
71 |
+
"mean_flops": 632494314169.865,
|
72 |
+
"median_flops": 650246690489.0,
|
73 |
+
"std_flops": 28226902727.115955,
|
74 |
+
"min_flops": 525222494388.0,
|
75 |
+
"max_flops": 650246690489.0,
|
76 |
+
"total_flops": 126498862833973.0
|
77 |
+
},
|
78 |
+
"speedup": {
|
79 |
+
"mean_time": 5.471761356592179,
|
80 |
+
"median_time": 5.289220690727234,
|
81 |
+
"std_time": 2.4624447347559575,
|
82 |
+
"min_time": 0.5140595436096191,
|
83 |
+
"max_time": 8.949026346206665,
|
84 |
+
"total_time": 1094.3522713184357
|
85 |
+
},
|
86 |
+
"avg_flops": 632494314169.865,
|
87 |
+
"avg_time": 5.471761356592179
|
88 |
+
},
|
89 |
+
"0.3": {
|
90 |
+
"accuracy": 0.235,
|
91 |
+
"efficiency": {
|
92 |
+
"mean_flops": 948741471255.16,
|
93 |
+
"median_flops": 975370035734.0,
|
94 |
+
"std_flops": 42340354090.81005,
|
95 |
+
"min_flops": 787833741583.0,
|
96 |
+
"max_flops": 975370035734.0,
|
97 |
+
"total_flops": 189748294251032.0
|
98 |
+
},
|
99 |
+
"speedup": {
|
100 |
+
"mean_time": 5.410509116649628,
|
101 |
+
"median_time": 5.308708548545837,
|
102 |
+
"std_time": 2.347776937087377,
|
103 |
+
"min_time": 0.259929895401001,
|
104 |
+
"max_time": 8.930791139602661,
|
105 |
+
"total_time": 1082.1018233299255
|
106 |
+
},
|
107 |
+
"avg_flops": 948741471255.16,
|
108 |
+
"avg_time": 5.410509116649628
|
109 |
+
},
|
110 |
+
"0.4": {
|
111 |
+
"accuracy": 0.29,
|
112 |
+
"efficiency": {
|
113 |
+
"mean_flops": 1264988628340.475,
|
114 |
+
"median_flops": 1300493380979.0,
|
115 |
+
"std_flops": 56453805454.47149,
|
116 |
+
"min_flops": 1050444988777.0,
|
117 |
+
"max_flops": 1300493380979.0,
|
118 |
+
"total_flops": 252997725668095.0
|
119 |
+
},
|
120 |
+
"speedup": {
|
121 |
+
"mean_time": 5.346057170629502,
|
122 |
+
"median_time": 4.7032036781311035,
|
123 |
+
"std_time": 2.4986757843866743,
|
124 |
+
"min_time": 1.421558141708374,
|
125 |
+
"max_time": 8.93062448501587,
|
126 |
+
"total_time": 1069.2114341259003
|
127 |
+
},
|
128 |
+
"avg_flops": 1264988628340.475,
|
129 |
+
"avg_time": 5.346057170629502
|
130 |
+
},
|
131 |
+
"0.5": {
|
132 |
+
"accuracy": 0.26,
|
133 |
+
"efficiency": {
|
134 |
+
"mean_flops": 1581235785425.93,
|
135 |
+
"median_flops": 1625616726224.0,
|
136 |
+
"std_flops": 70567256817.9984,
|
137 |
+
"min_flops": 1313056235972.0,
|
138 |
+
"max_flops": 1625616726224.0,
|
139 |
+
"total_flops": 316247157085186.0
|
140 |
+
},
|
141 |
+
"speedup": {
|
142 |
+
"mean_time": 5.563114120960235,
|
143 |
+
"median_time": 5.053737163543701,
|
144 |
+
"std_time": 2.602014242594124,
|
145 |
+
"min_time": 1.132164478302002,
|
146 |
+
"max_time": 14.432584524154663,
|
147 |
+
"total_time": 1112.6228241920471
|
148 |
+
},
|
149 |
+
"avg_flops": 1581235785425.93,
|
150 |
+
"avg_time": 5.563114120960235
|
151 |
+
},
|
152 |
+
"0.6": {
|
153 |
+
"accuracy": 0.26,
|
154 |
+
"efficiency": {
|
155 |
+
"mean_flops": 1897482942510.48,
|
156 |
+
"median_flops": 1950740071468.0,
|
157 |
+
"std_flops": 84680708181.45291,
|
158 |
+
"min_flops": 1575667483166.0,
|
159 |
+
"max_flops": 1950740071468.0,
|
160 |
+
"total_flops": 379496588502096.0
|
161 |
+
},
|
162 |
+
"speedup": {
|
163 |
+
"mean_time": 5.5442887151241305,
|
164 |
+
"median_time": 5.442851781845093,
|
165 |
+
"std_time": 2.3669791337173227,
|
166 |
+
"min_time": 0.6514637470245361,
|
167 |
+
"max_time": 9.174688816070557,
|
168 |
+
"total_time": 1108.857743024826
|
169 |
+
},
|
170 |
+
"avg_flops": 1897482942510.48,
|
171 |
+
"avg_time": 5.5442887151241305
|
172 |
+
},
|
173 |
+
"0.7": {
|
174 |
+
"accuracy": 0.24,
|
175 |
+
"efficiency": {
|
176 |
+
"mean_flops": 2213730099595.73,
|
177 |
+
"median_flops": 2275863416713.0,
|
178 |
+
"std_flops": 98794159545.19882,
|
179 |
+
"min_flops": 1838278730360.0,
|
180 |
+
"max_flops": 2275863416713.0,
|
181 |
+
"total_flops": 442746019919146.0
|
182 |
+
},
|
183 |
+
"speedup": {
|
184 |
+
"mean_time": 5.172487876415253,
|
185 |
+
"median_time": 4.997930645942688,
|
186 |
+
"std_time": 2.330700061672161,
|
187 |
+
"min_time": 1.2688794136047363,
|
188 |
+
"max_time": 8.9685537815094,
|
189 |
+
"total_time": 1034.4975752830505
|
190 |
+
},
|
191 |
+
"avg_flops": 2213730099595.73,
|
192 |
+
"avg_time": 5.172487876415253
|
193 |
+
},
|
194 |
+
"0.8": {
|
195 |
+
"accuracy": 0.29,
|
196 |
+
"efficiency": {
|
197 |
+
"mean_flops": 2529977256681.09,
|
198 |
+
"median_flops": 2600986761958.0,
|
199 |
+
"std_flops": 112907610908.80844,
|
200 |
+
"min_flops": 2100889977555.0,
|
201 |
+
"max_flops": 2600986761958.0,
|
202 |
+
"total_flops": 505995451336218.0
|
203 |
+
},
|
204 |
+
"speedup": {
|
205 |
+
"mean_time": 5.460249330997467,
|
206 |
+
"median_time": 5.191383481025696,
|
207 |
+
"std_time": 2.320604780645787,
|
208 |
+
"min_time": 0.48939085006713867,
|
209 |
+
"max_time": 8.930345058441162,
|
210 |
+
"total_time": 1092.0498661994934
|
211 |
+
},
|
212 |
+
"avg_flops": 2529977256681.09,
|
213 |
+
"avg_time": 5.460249330997467
|
214 |
+
},
|
215 |
+
"0.9": {
|
216 |
+
"accuracy": 0.29,
|
217 |
+
"efficiency": {
|
218 |
+
"mean_flops": 2846224413766.405,
|
219 |
+
"median_flops": 2926110107203.0,
|
220 |
+
"std_flops": 127021062272.46988,
|
221 |
+
"min_flops": 2363501224749.0,
|
222 |
+
"max_flops": 2926110107203.0,
|
223 |
+
"total_flops": 569244882753281.0
|
224 |
+
},
|
225 |
+
"speedup": {
|
226 |
+
"mean_time": 5.496963980197907,
|
227 |
+
"median_time": 5.254109740257263,
|
228 |
+
"std_time": 2.4695009865065334,
|
229 |
+
"min_time": 0.5802221298217773,
|
230 |
+
"max_time": 8.95212435722351,
|
231 |
+
"total_time": 1099.3927960395813
|
232 |
+
},
|
233 |
+
"avg_flops": 2846224413766.405,
|
234 |
+
"avg_time": 5.496963980197907
|
235 |
+
},
|
236 |
+
"1.0": {
|
237 |
+
"accuracy": 0.28,
|
238 |
+
"efficiency": {
|
239 |
+
"mean_flops": 3162471570851.86,
|
240 |
+
"median_flops": 3251233452448.0,
|
241 |
+
"std_flops": 141134513635.9968,
|
242 |
+
"min_flops": 2626112471944.0,
|
243 |
+
"max_flops": 3251233452448.0,
|
244 |
+
"total_flops": 632494314170372.0
|
245 |
+
},
|
246 |
+
"speedup": {
|
247 |
+
"mean_time": 5.370402137041092,
|
248 |
+
"median_time": 5.241830468177795,
|
249 |
+
"std_time": 2.5148718604922156,
|
250 |
+
"min_time": 0.1468040943145752,
|
251 |
+
"max_time": 14.612837076187134,
|
252 |
+
"total_time": 1074.0804274082184
|
253 |
+
},
|
254 |
+
"avg_flops": 3162471570851.86,
|
255 |
+
"avg_time": 5.370402137041092
|
256 |
+
}
|
257 |
+
}
|
258 |
+
}
|
259 |
+
}
|
gpu_training/eval/.ipynb_checkpoints/summary-checkpoint.txt
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FR-Ponder Evaluation Results
|
2 |
+
==============================
|
3 |
+
|
4 |
+
Dataset: gsm8k
|
5 |
+
--------------------
|
6 |
+
FR-Ponder:
|
7 |
+
Accuracy: 0.000
|
8 |
+
Avg FLOPs: 1548841728
|
9 |
+
Avg Time: 3.288s
|
10 |
+
Avg Steps: 118.1
|
11 |
+
Baseline (α=0.4):
|
12 |
+
Accuracy: 0.240
|
13 |
+
Avg FLOPs: 1264988628340
|
14 |
+
Avg Time: 5.507s
|
15 |
+
Improvements:
|
16 |
+
Accuracy: +-0.240
|
17 |
+
FLOPs reduction: 99.9%
|
18 |
+
Speedup: 1.68x
|
gpu_training/eval/evaluation_results.json
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"gsm8k": {
|
3 |
+
"fr_ponder": {
|
4 |
+
"accuracy": 0.0,
|
5 |
+
"efficiency": {
|
6 |
+
"mean_flops": 1368148992.0,
|
7 |
+
"median_flops": 1554416640.0,
|
8 |
+
"std_flops": 1617190973.1658711,
|
9 |
+
"min_flops": 26234880.0,
|
10 |
+
"max_flops": 8591923200.0,
|
11 |
+
"total_flops": 136814899200.0
|
12 |
+
},
|
13 |
+
"speedup": {
|
14 |
+
"mean_time": 3.323502082824707,
|
15 |
+
"median_time": 4.929964542388916,
|
16 |
+
"std_time": 3.102591789855366,
|
17 |
+
"min_time": 0.08603239059448242,
|
18 |
+
"max_time": 9.481685638427734,
|
19 |
+
"total_time": 332.3502082824707
|
20 |
+
},
|
21 |
+
"avg_flops": 1368148992.0,
|
22 |
+
"avg_time": 3.323502082824707,
|
23 |
+
"avg_steps": 104.3
|
24 |
+
},
|
25 |
+
"baseline": {
|
26 |
+
"accuracy": 0.25,
|
27 |
+
"efficiency": {
|
28 |
+
"mean_flops": 1259616826016.48,
|
29 |
+
"median_flops": 1300493380979.0,
|
30 |
+
"std_flops": 61378209282.38763,
|
31 |
+
"min_flops": 1050444988777.0,
|
32 |
+
"max_flops": 1300493380979.0,
|
33 |
+
"total_flops": 125961682601648.0
|
34 |
+
},
|
35 |
+
"speedup": {
|
36 |
+
"mean_time": 5.597262227535248,
|
37 |
+
"median_time": 5.318989992141724,
|
38 |
+
"std_time": 2.637104224359403,
|
39 |
+
"min_time": 0.9660005569458008,
|
40 |
+
"max_time": 9.206240177154541,
|
41 |
+
"total_time": 559.7262227535248
|
42 |
+
},
|
43 |
+
"avg_flops": 1259616826016.48,
|
44 |
+
"avg_time": 5.597262227535248
|
45 |
+
},
|
46 |
+
"alpha_sweep": {
|
47 |
+
"0.1": {
|
48 |
+
"accuracy": 0.25,
|
49 |
+
"efficiency": {
|
50 |
+
"mean_flops": 314904206503.56,
|
51 |
+
"median_flops": 325123345244.0,
|
52 |
+
"std_flops": 15344552320.43663,
|
53 |
+
"min_flops": 262611247194.0,
|
54 |
+
"max_flops": 325123345244.0,
|
55 |
+
"total_flops": 31490420650356.0
|
56 |
+
},
|
57 |
+
"speedup": {
|
58 |
+
"mean_time": 5.451106414794922,
|
59 |
+
"median_time": 5.010058760643005,
|
60 |
+
"std_time": 2.636969014137301,
|
61 |
+
"min_time": 1.0382819175720215,
|
62 |
+
"max_time": 9.177106380462646,
|
63 |
+
"total_time": 545.1106414794922
|
64 |
+
},
|
65 |
+
"avg_flops": 314904206503.56,
|
66 |
+
"avg_time": 5.451106414794922
|
67 |
+
}
|
68 |
+
}
|
69 |
+
}
|
70 |
+
}
|
gpu_training/eval/summary.txt
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FR-Ponder Evaluation Results
|
2 |
+
==============================
|
3 |
+
|
4 |
+
Dataset: gsm8k
|
5 |
+
--------------------
|
6 |
+
FR-Ponder:
|
7 |
+
Accuracy: 0.000
|
8 |
+
Avg FLOPs: 1368148992
|
9 |
+
Avg Time: 3.324s
|
10 |
+
Avg Steps: 104.3
|
11 |
+
Baseline (α=0.4):
|
12 |
+
Accuracy: 0.250
|
13 |
+
Avg FLOPs: 1259616826016
|
14 |
+
Avg Time: 5.597s
|
15 |
+
Improvements:
|
16 |
+
Accuracy: +-0.250
|
17 |
+
FLOPs reduction: 99.9%
|
18 |
+
Speedup: 1.68x
|
gpu_training/final_model.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dc472ffda208b92c7857cb93c678981d07992e0ee79ce12ac7c09f818effafef
|
3 |
+
size 78711115
|
gpu_training/steering/.ipynb_checkpoints/metadata-checkpoint.json
ADDED
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"model_name": "microsoft/phi-2",
|
3 |
+
"positive_prompt": "Think step-by-step",
|
4 |
+
"negative_prompt": "Direct answer",
|
5 |
+
"layer_indices": [
|
6 |
+
0,
|
7 |
+
1,
|
8 |
+
2,
|
9 |
+
3,
|
10 |
+
4,
|
11 |
+
5,
|
12 |
+
6,
|
13 |
+
7,
|
14 |
+
8,
|
15 |
+
9,
|
16 |
+
10,
|
17 |
+
11,
|
18 |
+
12,
|
19 |
+
13,
|
20 |
+
14,
|
21 |
+
15,
|
22 |
+
16,
|
23 |
+
17,
|
24 |
+
18,
|
25 |
+
19,
|
26 |
+
20,
|
27 |
+
21,
|
28 |
+
22,
|
29 |
+
23,
|
30 |
+
24,
|
31 |
+
25,
|
32 |
+
26,
|
33 |
+
27,
|
34 |
+
28,
|
35 |
+
29,
|
36 |
+
30,
|
37 |
+
31
|
38 |
+
],
|
39 |
+
"num_samples": 500,
|
40 |
+
"vector_shapes": {
|
41 |
+
"0": [
|
42 |
+
2560
|
43 |
+
],
|
44 |
+
"1": [
|
45 |
+
2560
|
46 |
+
],
|
47 |
+
"2": [
|
48 |
+
2560
|
49 |
+
],
|
50 |
+
"3": [
|
51 |
+
2560
|
52 |
+
],
|
53 |
+
"4": [
|
54 |
+
2560
|
55 |
+
],
|
56 |
+
"5": [
|
57 |
+
2560
|
58 |
+
],
|
59 |
+
"6": [
|
60 |
+
2560
|
61 |
+
],
|
62 |
+
"7": [
|
63 |
+
2560
|
64 |
+
],
|
65 |
+
"8": [
|
66 |
+
2560
|
67 |
+
],
|
68 |
+
"9": [
|
69 |
+
2560
|
70 |
+
],
|
71 |
+
"10": [
|
72 |
+
2560
|
73 |
+
],
|
74 |
+
"11": [
|
75 |
+
2560
|
76 |
+
],
|
77 |
+
"12": [
|
78 |
+
2560
|
79 |
+
],
|
80 |
+
"13": [
|
81 |
+
2560
|
82 |
+
],
|
83 |
+
"14": [
|
84 |
+
2560
|
85 |
+
],
|
86 |
+
"15": [
|
87 |
+
2560
|
88 |
+
],
|
89 |
+
"16": [
|
90 |
+
2560
|
91 |
+
],
|
92 |
+
"17": [
|
93 |
+
2560
|
94 |
+
],
|
95 |
+
"18": [
|
96 |
+
2560
|
97 |
+
],
|
98 |
+
"19": [
|
99 |
+
2560
|
100 |
+
],
|
101 |
+
"20": [
|
102 |
+
2560
|
103 |
+
],
|
104 |
+
"21": [
|
105 |
+
2560
|
106 |
+
],
|
107 |
+
"22": [
|
108 |
+
2560
|
109 |
+
],
|
110 |
+
"23": [
|
111 |
+
2560
|
112 |
+
],
|
113 |
+
"24": [
|
114 |
+
2560
|
115 |
+
],
|
116 |
+
"25": [
|
117 |
+
2560
|
118 |
+
],
|
119 |
+
"26": [
|
120 |
+
2560
|
121 |
+
],
|
122 |
+
"27": [
|
123 |
+
2560
|
124 |
+
],
|
125 |
+
"28": [
|
126 |
+
2560
|
127 |
+
],
|
128 |
+
"29": [
|
129 |
+
2560
|
130 |
+
],
|
131 |
+
"30": [
|
132 |
+
2560
|
133 |
+
],
|
134 |
+
"31": [
|
135 |
+
2560
|
136 |
+
]
|
137 |
+
}
|
138 |
+
}
|
gpu_training/steering/metadata.json
ADDED
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"model_name": "microsoft/phi-2",
|
3 |
+
"positive_prompt": "Think step-by-step",
|
4 |
+
"negative_prompt": "Direct answer",
|
5 |
+
"layer_indices": [
|
6 |
+
0,
|
7 |
+
1,
|
8 |
+
2,
|
9 |
+
3,
|
10 |
+
4,
|
11 |
+
5,
|
12 |
+
6,
|
13 |
+
7,
|
14 |
+
8,
|
15 |
+
9,
|
16 |
+
10,
|
17 |
+
11,
|
18 |
+
12,
|
19 |
+
13,
|
20 |
+
14,
|
21 |
+
15,
|
22 |
+
16,
|
23 |
+
17,
|
24 |
+
18,
|
25 |
+
19,
|
26 |
+
20,
|
27 |
+
21,
|
28 |
+
22,
|
29 |
+
23,
|
30 |
+
24,
|
31 |
+
25,
|
32 |
+
26,
|
33 |
+
27,
|
34 |
+
28,
|
35 |
+
29,
|
36 |
+
30,
|
37 |
+
31
|
38 |
+
],
|
39 |
+
"num_samples": 500,
|
40 |
+
"vector_shapes": {
|
41 |
+
"0": [
|
42 |
+
2560
|
43 |
+
],
|
44 |
+
"1": [
|
45 |
+
2560
|
46 |
+
],
|
47 |
+
"2": [
|
48 |
+
2560
|
49 |
+
],
|
50 |
+
"3": [
|
51 |
+
2560
|
52 |
+
],
|
53 |
+
"4": [
|
54 |
+
2560
|
55 |
+
],
|
56 |
+
"5": [
|
57 |
+
2560
|
58 |
+
],
|
59 |
+
"6": [
|
60 |
+
2560
|
61 |
+
],
|
62 |
+
"7": [
|
63 |
+
2560
|
64 |
+
],
|
65 |
+
"8": [
|
66 |
+
2560
|
67 |
+
],
|
68 |
+
"9": [
|
69 |
+
2560
|
70 |
+
],
|
71 |
+
"10": [
|
72 |
+
2560
|
73 |
+
],
|
74 |
+
"11": [
|
75 |
+
2560
|
76 |
+
],
|
77 |
+
"12": [
|
78 |
+
2560
|
79 |
+
],
|
80 |
+
"13": [
|
81 |
+
2560
|
82 |
+
],
|
83 |
+
"14": [
|
84 |
+
2560
|
85 |
+
],
|
86 |
+
"15": [
|
87 |
+
2560
|
88 |
+
],
|
89 |
+
"16": [
|
90 |
+
2560
|
91 |
+
],
|
92 |
+
"17": [
|
93 |
+
2560
|
94 |
+
],
|
95 |
+
"18": [
|
96 |
+
2560
|
97 |
+
],
|
98 |
+
"19": [
|
99 |
+
2560
|
100 |
+
],
|
101 |
+
"20": [
|
102 |
+
2560
|
103 |
+
],
|
104 |
+
"21": [
|
105 |
+
2560
|
106 |
+
],
|
107 |
+
"22": [
|
108 |
+
2560
|
109 |
+
],
|
110 |
+
"23": [
|
111 |
+
2560
|
112 |
+
],
|
113 |
+
"24": [
|
114 |
+
2560
|
115 |
+
],
|
116 |
+
"25": [
|
117 |
+
2560
|
118 |
+
],
|
119 |
+
"26": [
|
120 |
+
2560
|
121 |
+
],
|
122 |
+
"27": [
|
123 |
+
2560
|
124 |
+
],
|
125 |
+
"28": [
|
126 |
+
2560
|
127 |
+
],
|
128 |
+
"29": [
|
129 |
+
2560
|
130 |
+
],
|
131 |
+
"30": [
|
132 |
+
2560
|
133 |
+
],
|
134 |
+
"31": [
|
135 |
+
2560
|
136 |
+
]
|
137 |
+
}
|
138 |
+
}
|
gpu_training/steering/steering_vectors.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bac198bb75f847f9bc6af2e86fdce994463d1db08b9cc42ed8c06beb05b52d0a
|
3 |
+
size 173273
|
gpu_training_20250713_015232/steering/metadata.json
ADDED
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"model_name": "microsoft/phi-2",
|
3 |
+
"positive_prompt": "Think step-by-step",
|
4 |
+
"negative_prompt": "Direct answer",
|
5 |
+
"layer_indices": [
|
6 |
+
0,
|
7 |
+
1,
|
8 |
+
2,
|
9 |
+
3,
|
10 |
+
4,
|
11 |
+
5,
|
12 |
+
6,
|
13 |
+
7,
|
14 |
+
8,
|
15 |
+
9,
|
16 |
+
10,
|
17 |
+
11,
|
18 |
+
12,
|
19 |
+
13,
|
20 |
+
14,
|
21 |
+
15,
|
22 |
+
16,
|
23 |
+
17,
|
24 |
+
18,
|
25 |
+
19,
|
26 |
+
20,
|
27 |
+
21,
|
28 |
+
22,
|
29 |
+
23,
|
30 |
+
24,
|
31 |
+
25,
|
32 |
+
26,
|
33 |
+
27,
|
34 |
+
28,
|
35 |
+
29,
|
36 |
+
30,
|
37 |
+
31
|
38 |
+
],
|
39 |
+
"num_samples": 200,
|
40 |
+
"vector_shapes": {
|
41 |
+
"0": [
|
42 |
+
2560
|
43 |
+
],
|
44 |
+
"1": [
|
45 |
+
2560
|
46 |
+
],
|
47 |
+
"2": [
|
48 |
+
2560
|
49 |
+
],
|
50 |
+
"3": [
|
51 |
+
2560
|
52 |
+
],
|
53 |
+
"4": [
|
54 |
+
2560
|
55 |
+
],
|
56 |
+
"5": [
|
57 |
+
2560
|
58 |
+
],
|
59 |
+
"6": [
|
60 |
+
2560
|
61 |
+
],
|
62 |
+
"7": [
|
63 |
+
2560
|
64 |
+
],
|
65 |
+
"8": [
|
66 |
+
2560
|
67 |
+
],
|
68 |
+
"9": [
|
69 |
+
2560
|
70 |
+
],
|
71 |
+
"10": [
|
72 |
+
2560
|
73 |
+
],
|
74 |
+
"11": [
|
75 |
+
2560
|
76 |
+
],
|
77 |
+
"12": [
|
78 |
+
2560
|
79 |
+
],
|
80 |
+
"13": [
|
81 |
+
2560
|
82 |
+
],
|
83 |
+
"14": [
|
84 |
+
2560
|
85 |
+
],
|
86 |
+
"15": [
|
87 |
+
2560
|
88 |
+
],
|
89 |
+
"16": [
|
90 |
+
2560
|
91 |
+
],
|
92 |
+
"17": [
|
93 |
+
2560
|
94 |
+
],
|
95 |
+
"18": [
|
96 |
+
2560
|
97 |
+
],
|
98 |
+
"19": [
|
99 |
+
2560
|
100 |
+
],
|
101 |
+
"20": [
|
102 |
+
2560
|
103 |
+
],
|
104 |
+
"21": [
|
105 |
+
2560
|
106 |
+
],
|
107 |
+
"22": [
|
108 |
+
2560
|
109 |
+
],
|
110 |
+
"23": [
|
111 |
+
2560
|
112 |
+
],
|
113 |
+
"24": [
|
114 |
+
2560
|
115 |
+
],
|
116 |
+
"25": [
|
117 |
+
2560
|
118 |
+
],
|
119 |
+
"26": [
|
120 |
+
2560
|
121 |
+
],
|
122 |
+
"27": [
|
123 |
+
2560
|
124 |
+
],
|
125 |
+
"28": [
|
126 |
+
2560
|
127 |
+
],
|
128 |
+
"29": [
|
129 |
+
2560
|
130 |
+
],
|
131 |
+
"30": [
|
132 |
+
2560
|
133 |
+
],
|
134 |
+
"31": [
|
135 |
+
2560
|
136 |
+
]
|
137 |
+
}
|
138 |
+
}
|
gpu_training_20250713_015232/steering/steering_vectors.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bac198bb75f847f9bc6af2e86fdce994463d1db08b9cc42ed8c06beb05b52d0a
|
3 |
+
size 173273
|
gpu_training_20250713_032436/steering/metadata.json
ADDED
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"model_name": "microsoft/phi-2",
|
3 |
+
"positive_prompt": "Think step-by-step",
|
4 |
+
"negative_prompt": "Direct answer",
|
5 |
+
"layer_indices": [
|
6 |
+
0,
|
7 |
+
1,
|
8 |
+
2,
|
9 |
+
3,
|
10 |
+
4,
|
11 |
+
5,
|
12 |
+
6,
|
13 |
+
7,
|
14 |
+
8,
|
15 |
+
9,
|
16 |
+
10,
|
17 |
+
11,
|
18 |
+
12,
|
19 |
+
13,
|
20 |
+
14,
|
21 |
+
15,
|
22 |
+
16,
|
23 |
+
17,
|
24 |
+
18,
|
25 |
+
19,
|
26 |
+
20,
|
27 |
+
21,
|
28 |
+
22,
|
29 |
+
23,
|
30 |
+
24,
|
31 |
+
25,
|
32 |
+
26,
|
33 |
+
27,
|
34 |
+
28,
|
35 |
+
29,
|
36 |
+
30,
|
37 |
+
31
|
38 |
+
],
|
39 |
+
"num_samples": 200,
|
40 |
+
"vector_shapes": {
|
41 |
+
"0": [
|
42 |
+
2560
|
43 |
+
],
|
44 |
+
"1": [
|
45 |
+
2560
|
46 |
+
],
|
47 |
+
"2": [
|
48 |
+
2560
|
49 |
+
],
|
50 |
+
"3": [
|
51 |
+
2560
|
52 |
+
],
|
53 |
+
"4": [
|
54 |
+
2560
|
55 |
+
],
|
56 |
+
"5": [
|
57 |
+
2560
|
58 |
+
],
|
59 |
+
"6": [
|
60 |
+
2560
|
61 |
+
],
|
62 |
+
"7": [
|
63 |
+
2560
|
64 |
+
],
|
65 |
+
"8": [
|
66 |
+
2560
|
67 |
+
],
|
68 |
+
"9": [
|
69 |
+
2560
|
70 |
+
],
|
71 |
+
"10": [
|
72 |
+
2560
|
73 |
+
],
|
74 |
+
"11": [
|
75 |
+
2560
|
76 |
+
],
|
77 |
+
"12": [
|
78 |
+
2560
|
79 |
+
],
|
80 |
+
"13": [
|
81 |
+
2560
|
82 |
+
],
|
83 |
+
"14": [
|
84 |
+
2560
|
85 |
+
],
|
86 |
+
"15": [
|
87 |
+
2560
|
88 |
+
],
|
89 |
+
"16": [
|
90 |
+
2560
|
91 |
+
],
|
92 |
+
"17": [
|
93 |
+
2560
|
94 |
+
],
|
95 |
+
"18": [
|
96 |
+
2560
|
97 |
+
],
|
98 |
+
"19": [
|
99 |
+
2560
|
100 |
+
],
|
101 |
+
"20": [
|
102 |
+
2560
|
103 |
+
],
|
104 |
+
"21": [
|
105 |
+
2560
|
106 |
+
],
|
107 |
+
"22": [
|
108 |
+
2560
|
109 |
+
],
|
110 |
+
"23": [
|
111 |
+
2560
|
112 |
+
],
|
113 |
+
"24": [
|
114 |
+
2560
|
115 |
+
],
|
116 |
+
"25": [
|
117 |
+
2560
|
118 |
+
],
|
119 |
+
"26": [
|
120 |
+
2560
|
121 |
+
],
|
122 |
+
"27": [
|
123 |
+
2560
|
124 |
+
],
|
125 |
+
"28": [
|
126 |
+
2560
|
127 |
+
],
|
128 |
+
"29": [
|
129 |
+
2560
|
130 |
+
],
|
131 |
+
"30": [
|
132 |
+
2560
|
133 |
+
],
|
134 |
+
"31": [
|
135 |
+
2560
|
136 |
+
]
|
137 |
+
}
|
138 |
+
}
|
gpu_training_20250713_032436/steering/steering_vectors.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bac198bb75f847f9bc6af2e86fdce994463d1db08b9cc42ed8c06beb05b52d0a
|
3 |
+
size 173273
|
gpu_training_20250713_032744/steering/metadata.json
ADDED
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"model_name": "microsoft/phi-2",
|
3 |
+
"positive_prompt": "Think step-by-step",
|
4 |
+
"negative_prompt": "Direct answer",
|
5 |
+
"layer_indices": [
|
6 |
+
0,
|
7 |
+
1,
|
8 |
+
2,
|
9 |
+
3,
|
10 |
+
4,
|
11 |
+
5,
|
12 |
+
6,
|
13 |
+
7,
|
14 |
+
8,
|
15 |
+
9,
|
16 |
+
10,
|
17 |
+
11,
|
18 |
+
12,
|
19 |
+
13,
|
20 |
+
14,
|
21 |
+
15,
|
22 |
+
16,
|
23 |
+
17,
|
24 |
+
18,
|
25 |
+
19,
|
26 |
+
20,
|
27 |
+
21,
|
28 |
+
22,
|
29 |
+
23,
|
30 |
+
24,
|
31 |
+
25,
|
32 |
+
26,
|
33 |
+
27,
|
34 |
+
28,
|
35 |
+
29,
|
36 |
+
30,
|
37 |
+
31
|
38 |
+
],
|
39 |
+
"num_samples": 200,
|
40 |
+
"vector_shapes": {
|
41 |
+
"0": [
|
42 |
+
2560
|
43 |
+
],
|
44 |
+
"1": [
|
45 |
+
2560
|
46 |
+
],
|
47 |
+
"2": [
|
48 |
+
2560
|
49 |
+
],
|
50 |
+
"3": [
|
51 |
+
2560
|
52 |
+
],
|
53 |
+
"4": [
|
54 |
+
2560
|
55 |
+
],
|
56 |
+
"5": [
|
57 |
+
2560
|
58 |
+
],
|
59 |
+
"6": [
|
60 |
+
2560
|
61 |
+
],
|
62 |
+
"7": [
|
63 |
+
2560
|
64 |
+
],
|
65 |
+
"8": [
|
66 |
+
2560
|
67 |
+
],
|
68 |
+
"9": [
|
69 |
+
2560
|
70 |
+
],
|
71 |
+
"10": [
|
72 |
+
2560
|
73 |
+
],
|
74 |
+
"11": [
|
75 |
+
2560
|
76 |
+
],
|
77 |
+
"12": [
|
78 |
+
2560
|
79 |
+
],
|
80 |
+
"13": [
|
81 |
+
2560
|
82 |
+
],
|
83 |
+
"14": [
|
84 |
+
2560
|
85 |
+
],
|
86 |
+
"15": [
|
87 |
+
2560
|
88 |
+
],
|
89 |
+
"16": [
|
90 |
+
2560
|
91 |
+
],
|
92 |
+
"17": [
|
93 |
+
2560
|
94 |
+
],
|
95 |
+
"18": [
|
96 |
+
2560
|
97 |
+
],
|
98 |
+
"19": [
|
99 |
+
2560
|
100 |
+
],
|
101 |
+
"20": [
|
102 |
+
2560
|
103 |
+
],
|
104 |
+
"21": [
|
105 |
+
2560
|
106 |
+
],
|
107 |
+
"22": [
|
108 |
+
2560
|
109 |
+
],
|
110 |
+
"23": [
|
111 |
+
2560
|
112 |
+
],
|
113 |
+
"24": [
|
114 |
+
2560
|
115 |
+
],
|
116 |
+
"25": [
|
117 |
+
2560
|
118 |
+
],
|
119 |
+
"26": [
|
120 |
+
2560
|
121 |
+
],
|
122 |
+
"27": [
|
123 |
+
2560
|
124 |
+
],
|
125 |
+
"28": [
|
126 |
+
2560
|
127 |
+
],
|
128 |
+
"29": [
|
129 |
+
2560
|
130 |
+
],
|
131 |
+
"30": [
|
132 |
+
2560
|
133 |
+
],
|
134 |
+
"31": [
|
135 |
+
2560
|
136 |
+
]
|
137 |
+
}
|
138 |
+
}
|
gpu_training_20250713_032744/steering/steering_vectors.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bac198bb75f847f9bc6af2e86fdce994463d1db08b9cc42ed8c06beb05b52d0a
|
3 |
+
size 173273
|
gpu_training_20250713_033240/best_model.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6b2b8b264b481c80321b472a015526a6f1568ae5a9c803e64b01d3831b804234
|
3 |
+
size 34999
|
gpu_training_20250713_033240/checkpoint_0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:49ec15a40042441da104a18c39f176444df227cfd6c5cb778e5a017378ca3401
|
3 |
+
size 35027
|
gpu_training_20250713_033240/checkpoint_1000.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:21296f1af03377cff6d38d4d013e088c0ca1f176851efe4ee78137dc101060d0
|
3 |
+
size 35069
|
gpu_training_20250713_033240/checkpoint_1500.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:80561d8da90baad0faf297824f652fa2c0e9ad70c74baf8888adde51b0928942
|
3 |
+
size 35069
|
gpu_training_20250713_033240/checkpoint_2000.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:69fe1baa2fc525b66688aef7ae9ed73b78bcc85a2b3d728b84d309fc6b6e3b95
|
3 |
+
size 35069
|
gpu_training_20250713_033240/checkpoint_2500.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cb8abdcdf4b763446c4492eb3a3b8dd39612ae9090c3dbc704d31bb93a93d0a8
|
3 |
+
size 35069
|
gpu_training_20250713_033240/checkpoint_3000.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4ac5de4471b67f24536f5a0a89fb982b21bd1d6c821bec2b6a47752f791e5ade
|
3 |
+
size 35069
|
gpu_training_20250713_033240/checkpoint_3500.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:88a8249a0039cd77449f0e96f4c48211c109ac39d1e58ba03faf771be61dce50
|
3 |
+
size 35069
|
gpu_training_20250713_033240/checkpoint_4000.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:01beb87333b14a081390eb537c3b07e8a062813b1720b5ae083ae9fa39e16ce5
|
3 |
+
size 35069
|
gpu_training_20250713_033240/checkpoint_4500.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4b72e5b7ad37abac40dde41f23ab7f2fde514d6d8f51d45ab90b3cff173255a9
|
3 |
+
size 35069
|
gpu_training_20250713_033240/checkpoint_500.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8c3d08cca746929883326ac36055ec13bbd7c5e64d7a7512bfe86495f6c21829
|
3 |
+
size 35055
|
gpu_training_20250713_033240/final_model.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7caff9c0fe2f9b3bf912ee52396392d53eaa8c817f78dadbd80f6fa884a128d1
|
3 |
+
size 35013
|
gpu_training_20250713_033240/steering/metadata.json
ADDED
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"model_name": "microsoft/phi-2",
|
3 |
+
"positive_prompt": "Think step-by-step",
|
4 |
+
"negative_prompt": "Direct answer",
|
5 |
+
"layer_indices": [
|
6 |
+
0,
|
7 |
+
1,
|
8 |
+
2,
|
9 |
+
3,
|
10 |
+
4,
|
11 |
+
5,
|
12 |
+
6,
|
13 |
+
7,
|
14 |
+
8,
|
15 |
+
9,
|
16 |
+
10,
|
17 |
+
11,
|
18 |
+
12,
|
19 |
+
13,
|
20 |
+
14,
|
21 |
+
15,
|
22 |
+
16,
|
23 |
+
17,
|
24 |
+
18,
|
25 |
+
19,
|
26 |
+
20,
|
27 |
+
21,
|
28 |
+
22,
|
29 |
+
23,
|
30 |
+
24,
|
31 |
+
25,
|
32 |
+
26,
|
33 |
+
27,
|
34 |
+
28,
|
35 |
+
29,
|
36 |
+
30,
|
37 |
+
31
|
38 |
+
],
|
39 |
+
"num_samples": 200,
|
40 |
+
"vector_shapes": {
|
41 |
+
"0": [
|
42 |
+
2560
|
43 |
+
],
|
44 |
+
"1": [
|
45 |
+
2560
|
46 |
+
],
|
47 |
+
"2": [
|
48 |
+
2560
|
49 |
+
],
|
50 |
+
"3": [
|
51 |
+
2560
|
52 |
+
],
|
53 |
+
"4": [
|
54 |
+
2560
|
55 |
+
],
|
56 |
+
"5": [
|
57 |
+
2560
|
58 |
+
],
|
59 |
+
"6": [
|
60 |
+
2560
|
61 |
+
],
|
62 |
+
"7": [
|
63 |
+
2560
|
64 |
+
],
|
65 |
+
"8": [
|
66 |
+
2560
|
67 |
+
],
|
68 |
+
"9": [
|
69 |
+
2560
|
70 |
+
],
|
71 |
+
"10": [
|
72 |
+
2560
|
73 |
+
],
|
74 |
+
"11": [
|
75 |
+
2560
|
76 |
+
],
|
77 |
+
"12": [
|
78 |
+
2560
|
79 |
+
],
|
80 |
+
"13": [
|
81 |
+
2560
|
82 |
+
],
|
83 |
+
"14": [
|
84 |
+
2560
|
85 |
+
],
|
86 |
+
"15": [
|
87 |
+
2560
|
88 |
+
],
|
89 |
+
"16": [
|
90 |
+
2560
|
91 |
+
],
|
92 |
+
"17": [
|
93 |
+
2560
|
94 |
+
],
|
95 |
+
"18": [
|
96 |
+
2560
|
97 |
+
],
|
98 |
+
"19": [
|
99 |
+
2560
|
100 |
+
],
|
101 |
+
"20": [
|
102 |
+
2560
|
103 |
+
],
|
104 |
+
"21": [
|
105 |
+
2560
|
106 |
+
],
|
107 |
+
"22": [
|
108 |
+
2560
|
109 |
+
],
|
110 |
+
"23": [
|
111 |
+
2560
|
112 |
+
],
|
113 |
+
"24": [
|
114 |
+
2560
|
115 |
+
],
|
116 |
+
"25": [
|
117 |
+
2560
|
118 |
+
],
|
119 |
+
"26": [
|
120 |
+
2560
|
121 |
+
],
|
122 |
+
"27": [
|
123 |
+
2560
|
124 |
+
],
|
125 |
+
"28": [
|
126 |
+
2560
|
127 |
+
],
|
128 |
+
"29": [
|
129 |
+
2560
|
130 |
+
],
|
131 |
+
"30": [
|
132 |
+
2560
|
133 |
+
],
|
134 |
+
"31": [
|
135 |
+
2560
|
136 |
+
]
|
137 |
+
}
|
138 |
+
}
|
gpu_training_20250713_033240/steering/steering_vectors.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bac198bb75f847f9bc6af2e86fdce994463d1db08b9cc42ed8c06beb05b52d0a
|
3 |
+
size 173273
|
test_training/config.yaml
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
batch_size: 2
|
2 |
+
controller_dropout: 0.1
|
3 |
+
controller_layers: 2
|
4 |
+
controller_lr: 0.0001
|
5 |
+
dataset_name: gsm8k
|
6 |
+
epochs: 1
|
7 |
+
eval_baseline: true
|
8 |
+
eval_interval: 1
|
9 |
+
eval_samples: 5
|
10 |
+
hidden_size: 2560
|
11 |
+
lambda_accuracy: 1.0
|
12 |
+
lambda_flops: 0.005
|
13 |
+
log_interval: 10
|
14 |
+
max_ponder_steps: 3
|
15 |
+
model_name: microsoft/phi-2
|
16 |
+
output_dir: outputs/test_training
|
17 |
+
save_interval: 1
|
18 |
+
threshold: 0.3
|
19 |
+
train_samples: 10
|
20 |
+
warmup_steps: 100
|