TroyDoesAI
commited on
1 - Epoch of BlackSheep Persona with Training Logs
Browse files- .gitattributes +1 -0
- Training_BlackSheep_MoE/Epoch_1/Epoch_1-BlackSheep-with-Personas.png +0 -0
- Training_BlackSheep_MoE/Epoch_1/Epoch_1_WandDB/files/conda-environment.yaml +380 -0
- Training_BlackSheep_MoE/Epoch_1/Epoch_1_WandDB/files/wandb-metadata.json +53 -0
- Training_BlackSheep_MoE/Epoch_1/Epoch_1_WandDB/logs/debug-internal.log +0 -0
- Training_BlackSheep_MoE/Epoch_1/Epoch_1_WandDB/logs/debug.log +28 -0
- Training_BlackSheep_MoE/Epoch_1/Epoch_1_WandDB/run-zd7c97g6.wandb +3 -0
- Training_BlackSheep_MoE/Epoch_1/MIXTRAL-training_log_Epoch1-BlackSheep-with-Personas.txt +288 -0
- config.json +2 -2
- generation_config.json +1 -1
- model-00001-of-00004.safetensors +3 -0
- model-00002-of-00004.safetensors +3 -0
- model-00003-of-00004.safetensors +3 -0
- model-00004-of-00004.safetensors +3 -0
- model.safetensors.index.json +522 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
Training_BlackSheep_MoE/Epoch_1/Epoch_1_WandDB/run-zd7c97g6.wandb filter=lfs diff=lfs merge=lfs -text
|
Training_BlackSheep_MoE/Epoch_1/Epoch_1-BlackSheep-with-Personas.png
ADDED
Training_BlackSheep_MoE/Epoch_1/Epoch_1_WandDB/files/conda-environment.yaml
ADDED
@@ -0,0 +1,380 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: C:\Users\Administrator\Desktop\text-generation-webui-main\installer_files\env
|
2 |
+
channels:
|
3 |
+
- defaults
|
4 |
+
dependencies:
|
5 |
+
- bzip2=1.0.8=h2bbff1b_6
|
6 |
+
- ca-certificates=2024.7.2=haa95532_0
|
7 |
+
- git=2.45.2=haa95532_0
|
8 |
+
- libffi=3.4.4=hd77b12b_1
|
9 |
+
- ninja-base=1.10.2=h6d14046_5
|
10 |
+
- openssl=3.0.14=h827c3e9_0
|
11 |
+
- pip=24.0=py311haa95532_0
|
12 |
+
- python=3.11.9=he1021f5_0
|
13 |
+
- setuptools=69.5.1=py311haa95532_0
|
14 |
+
- sqlite=3.45.3=h2bbff1b_0
|
15 |
+
- tk=8.6.14=h0416ee5_0
|
16 |
+
- vc=14.2=h2eaa2aa_4
|
17 |
+
- vs2015_runtime=14.29.30133=h43f2093_4
|
18 |
+
- wheel=0.43.0=py311haa95532_0
|
19 |
+
- xz=5.4.6=h8cc25b3_1
|
20 |
+
- zlib=1.2.13=h8cc25b3_1
|
21 |
+
- pip:
|
22 |
+
- absl-py==2.1.0
|
23 |
+
- accelerate==0.33.0
|
24 |
+
- aiofiles==23.2.1
|
25 |
+
- aiohappyeyeballs==2.4.3
|
26 |
+
- aiohttp==3.10.8
|
27 |
+
- aiosignal==1.3.1
|
28 |
+
- alembic==1.13.2
|
29 |
+
- altair==5.3.0
|
30 |
+
- annotated-types==0.7.0
|
31 |
+
- anthropic==0.36.0
|
32 |
+
- anyio==4.4.0
|
33 |
+
- apscheduler==3.10.4
|
34 |
+
- argon2-cffi==23.1.0
|
35 |
+
- argon2-cffi-bindings==21.2.0
|
36 |
+
- asgiref==3.8.1
|
37 |
+
- attrs==23.2.0
|
38 |
+
- authlib==1.3.2
|
39 |
+
- auto-gptq==0.7.1
|
40 |
+
- autoawq==0.2.6
|
41 |
+
- autoawq-kernels==0.0.7
|
42 |
+
- av==12.3.0
|
43 |
+
- backoff==2.2.1
|
44 |
+
- bcrypt==4.2.0
|
45 |
+
- beautifulsoup4==4.12.3
|
46 |
+
- bidict==0.23.1
|
47 |
+
- bitarray==2.9.3
|
48 |
+
- bitsandbytes==0.43.3
|
49 |
+
- black==24.8.0
|
50 |
+
- blinker==1.8.2
|
51 |
+
- boto3==1.35.0
|
52 |
+
- botocore==1.35.39
|
53 |
+
- build==1.2.2.post1
|
54 |
+
- cachetools==5.5.0
|
55 |
+
- certifi==2024.7.4
|
56 |
+
- cffi==1.17.1
|
57 |
+
- chardet==5.2.0
|
58 |
+
- charset-normalizer==3.3.2
|
59 |
+
- chroma-hnswlib==0.7.6
|
60 |
+
- chromadb==0.5.9
|
61 |
+
- click==8.1.7
|
62 |
+
- colbert-ai==0.2.21
|
63 |
+
- colorama==0.4.6
|
64 |
+
- colorclass==2.2.2
|
65 |
+
- coloredlogs==15.0.1
|
66 |
+
- compressed-rtf==1.0.6
|
67 |
+
- comtypes==1.4.7
|
68 |
+
- contourpy==1.2.1
|
69 |
+
- cramjam==2.8.3
|
70 |
+
- cryptography==43.0.1
|
71 |
+
- ctranslate2==4.4.0
|
72 |
+
- cycler==0.12.1
|
73 |
+
- dataclasses-json==0.6.7
|
74 |
+
- dataproperty==1.0.1
|
75 |
+
- datasets==2.20.0
|
76 |
+
- defusedxml==0.7.1
|
77 |
+
- deprecated==1.2.14
|
78 |
+
- dill==0.3.8
|
79 |
+
- diskcache==5.6.3
|
80 |
+
- distro==1.9.0
|
81 |
+
- dnspython==2.6.1
|
82 |
+
- docker==7.1.0
|
83 |
+
- docker-pycreds==0.4.0
|
84 |
+
- docx2txt==0.8
|
85 |
+
- duckduckgo-search==6.2.13
|
86 |
+
- durationpy==0.9
|
87 |
+
- easygui==0.98.3
|
88 |
+
- ebcdic==1.1.1
|
89 |
+
- ecdsa==0.19.0
|
90 |
+
- einops==0.8.0
|
91 |
+
- elevenlabs==1.9.0
|
92 |
+
- email-validator==2.2.0
|
93 |
+
- emoji==2.14.0
|
94 |
+
- environs==9.5.0
|
95 |
+
- et-xmlfile==1.1.0
|
96 |
+
- eval-type-backport==0.2.0
|
97 |
+
- exllamav2==0.1.8+cu121.torch2.2.2
|
98 |
+
- extract-msg==0.51.1
|
99 |
+
- fake-useragent==1.5.1
|
100 |
+
- fastapi==0.111.0
|
101 |
+
- fastapi-cli==0.0.4
|
102 |
+
- faster-whisper==1.0.3
|
103 |
+
- fastparquet==2024.5.0
|
104 |
+
- ffmpy==0.3.2
|
105 |
+
- filelock==3.13.1
|
106 |
+
- filetype==1.2.0
|
107 |
+
- flash-attn==2.6.1
|
108 |
+
- flask==3.0.3
|
109 |
+
- flask-cloudflared==0.0.14
|
110 |
+
- flask-cors==5.0.0
|
111 |
+
- flask-sqlalchemy==3.1.1
|
112 |
+
- flatbuffers==24.3.25
|
113 |
+
- fonttools==4.53.1
|
114 |
+
- fpdf2==2.7.9
|
115 |
+
- frozenlist==1.4.1
|
116 |
+
- fsspec==2024.2.0
|
117 |
+
- ftfy==6.2.3
|
118 |
+
- gekko==1.2.1
|
119 |
+
- gguf==0.9.1
|
120 |
+
- git-python==1.0.3
|
121 |
+
- gitdb==4.0.11
|
122 |
+
- gitpython==3.1.43
|
123 |
+
- google-ai-generativelanguage==0.6.6
|
124 |
+
- google-api-core==2.21.0
|
125 |
+
- google-api-python-client==2.149.0
|
126 |
+
- google-auth==2.35.0
|
127 |
+
- google-auth-httplib2==0.2.0
|
128 |
+
- google-generativeai==0.7.2
|
129 |
+
- googleapis-common-protos==1.65.0
|
130 |
+
- gradio==4.26.0
|
131 |
+
- gradio-client==0.15.1
|
132 |
+
- greenlet==3.1.1
|
133 |
+
- grpcio==1.65.1
|
134 |
+
- grpcio-status==1.62.3
|
135 |
+
- h11==0.14.0
|
136 |
+
- halo==0.0.31
|
137 |
+
- hqq==0.1.7.post3
|
138 |
+
- httpcore==1.0.5
|
139 |
+
- httplib2==0.22.0
|
140 |
+
- httptools==0.6.1
|
141 |
+
- httpx==0.27.0
|
142 |
+
- huggingface-hub==0.24.0
|
143 |
+
- humanfriendly==10.0
|
144 |
+
- idna==3.7
|
145 |
+
- importlib-metadata==8.4.0
|
146 |
+
- importlib-resources==6.4.0
|
147 |
+
- iniconfig==2.0.0
|
148 |
+
- itsdangerous==2.2.0
|
149 |
+
- jinja2==3.1.4
|
150 |
+
- jiter==0.6.1
|
151 |
+
- jmespath==1.0.1
|
152 |
+
- joblib==1.4.2
|
153 |
+
- jsonl2json==1.0.0
|
154 |
+
- jsonlines==4.0.0
|
155 |
+
- jsonpatch==1.33
|
156 |
+
- jsonpath-python==1.0.6
|
157 |
+
- jsonpointer==3.0.0
|
158 |
+
- jsonschema==4.23.0
|
159 |
+
- jsonschema-specifications==2023.12.1
|
160 |
+
- keyboard==0.13.5
|
161 |
+
- kiwisolver==1.4.5
|
162 |
+
- kubernetes==31.0.0
|
163 |
+
- langchain==0.2.15
|
164 |
+
- langchain-chroma==0.1.4
|
165 |
+
- langchain-community==0.2.12
|
166 |
+
- langchain-core==0.2.41
|
167 |
+
- langchain-text-splitters==0.2.4
|
168 |
+
- langdetect==1.0.9
|
169 |
+
- langfuse==2.44.0
|
170 |
+
- langsmith==0.1.134
|
171 |
+
- lark==1.1.9
|
172 |
+
- llama-cpp-python==0.2.89+cpuavx2
|
173 |
+
- llama-cpp-python-cuda==0.2.89+cu121
|
174 |
+
- llama-cpp-python-cuda-tensorcores==0.2.89+cu121
|
175 |
+
- llvmlite==0.42.0
|
176 |
+
- lm-eval==0.3.0
|
177 |
+
- log-symbols==0.0.14
|
178 |
+
- lxml==5.3.0
|
179 |
+
- mako==1.3.5
|
180 |
+
- markdown==3.7
|
181 |
+
- markdown-it-py==3.0.0
|
182 |
+
- markupsafe==2.1.5
|
183 |
+
- marshmallow==3.22.0
|
184 |
+
- matplotlib==3.9.1
|
185 |
+
- mbstrdecoder==1.1.3
|
186 |
+
- mdurl==0.1.2
|
187 |
+
- mmh3==5.0.1
|
188 |
+
- monotonic==1.6
|
189 |
+
- mpmath==1.3.0
|
190 |
+
- msoffcrypto-tool==5.4.2
|
191 |
+
- multidict==6.0.5
|
192 |
+
- multiprocess==0.70.16
|
193 |
+
- mypy-extensions==1.0.0
|
194 |
+
- nest-asyncio==1.6.0
|
195 |
+
- networkx==3.2.1
|
196 |
+
- ninja==1.11.1.1
|
197 |
+
- nltk==3.9.1
|
198 |
+
- numba==0.59.1
|
199 |
+
- numexpr==2.10.1
|
200 |
+
- numpy==1.26.4
|
201 |
+
- oauthlib==3.2.2
|
202 |
+
- olefile==0.47
|
203 |
+
- oletools==0.60.2
|
204 |
+
- onnxruntime==1.19.2
|
205 |
+
- open-webui==0.3.32
|
206 |
+
- openai==1.37.0
|
207 |
+
- opencv-python==4.10.0.84
|
208 |
+
- opencv-python-headless==4.10.0.84
|
209 |
+
- openpyxl==3.1.5
|
210 |
+
- opentelemetry-api==1.27.0
|
211 |
+
- opentelemetry-exporter-otlp-proto-common==1.27.0
|
212 |
+
- opentelemetry-exporter-otlp-proto-grpc==1.27.0
|
213 |
+
- opentelemetry-instrumentation==0.48b0
|
214 |
+
- opentelemetry-instrumentation-asgi==0.48b0
|
215 |
+
- opentelemetry-instrumentation-fastapi==0.48b0
|
216 |
+
- opentelemetry-proto==1.27.0
|
217 |
+
- opentelemetry-sdk==1.27.0
|
218 |
+
- opentelemetry-semantic-conventions==0.48b0
|
219 |
+
- opentelemetry-util-http==0.48b0
|
220 |
+
- optimum==1.17.1
|
221 |
+
- orjson==3.10.6
|
222 |
+
- overrides==7.7.0
|
223 |
+
- packaging==23.2
|
224 |
+
- pandas==2.2.3
|
225 |
+
- passlib==1.7.4
|
226 |
+
- pathspec==0.12.1
|
227 |
+
- pathvalidate==3.2.0
|
228 |
+
- pcodedmp==1.2.6
|
229 |
+
- peewee==3.17.6
|
230 |
+
- peewee-migrate==1.12.2
|
231 |
+
- peft==0.12.0
|
232 |
+
- pillow==10.4.0
|
233 |
+
- platformdirs==4.2.2
|
234 |
+
- pluggy==1.5.0
|
235 |
+
- portalocker==2.10.1
|
236 |
+
- posthog==3.7.0
|
237 |
+
- primp==0.6.3
|
238 |
+
- propcache==0.2.0
|
239 |
+
- proto-plus==1.24.0
|
240 |
+
- protobuf==4.25.3
|
241 |
+
- psutil==6.0.0
|
242 |
+
- psycopg2-binary==2.9.9
|
243 |
+
- py-cpuinfo==9.0.0
|
244 |
+
- pyarrow==17.0.0
|
245 |
+
- pyarrow-hotfix==0.6
|
246 |
+
- pyasn1==0.6.1
|
247 |
+
- pyasn1-modules==0.4.1
|
248 |
+
- pyaudio==0.2.14
|
249 |
+
- pybind11==2.13.1
|
250 |
+
- pycaw==20240210
|
251 |
+
- pyclipper==1.3.0.post5
|
252 |
+
- pycountry==24.6.1
|
253 |
+
- pycparser==2.22
|
254 |
+
- pydantic==2.9.2
|
255 |
+
- pydantic-core==2.23.4
|
256 |
+
- pydub==0.25.1
|
257 |
+
- pygame==2.6.1
|
258 |
+
- pygments==2.18.0
|
259 |
+
- pyjwt==2.9.0
|
260 |
+
- pymilvus==2.4.7
|
261 |
+
- pymongo==4.10.1
|
262 |
+
- pymysql==1.1.1
|
263 |
+
- pypandoc==1.13
|
264 |
+
- pyparsing==3.1.2
|
265 |
+
- pypdf==4.3.1
|
266 |
+
- pypika==0.48.9
|
267 |
+
- pyproject-hooks==1.2.0
|
268 |
+
- pyqt5==5.15.11
|
269 |
+
- pyqt5-qt5==5.15.2
|
270 |
+
- pyqt5-sip==12.15.0
|
271 |
+
- pyreadline3==3.4.1
|
272 |
+
- pytablewriter==1.2.0
|
273 |
+
- pytest==8.3.3
|
274 |
+
- pytest-docker==3.1.1
|
275 |
+
- python-dateutil==2.8.2
|
276 |
+
- python-dotenv==1.0.1
|
277 |
+
- python-engineio==4.9.1
|
278 |
+
- python-iso639==2024.4.27
|
279 |
+
- python-jose==3.3.0
|
280 |
+
- python-magic==0.4.27
|
281 |
+
- python-multipart==0.0.9
|
282 |
+
- python-oxmsg==0.0.1
|
283 |
+
- python-pptx==1.0.0
|
284 |
+
- python-socketio==5.11.3
|
285 |
+
- pytube==15.0.0
|
286 |
+
- pytz==2024.1
|
287 |
+
- pywin32==306
|
288 |
+
- pyxlsb==1.0.10
|
289 |
+
- pyyaml==6.0.1
|
290 |
+
- rank-bm25==0.2.2
|
291 |
+
- rapidfuzz==3.10.0
|
292 |
+
- rapidocr-onnxruntime==1.3.24
|
293 |
+
- red-black-tree-mod==1.20
|
294 |
+
- redis==5.1.1
|
295 |
+
- referencing==0.35.1
|
296 |
+
- regex==2024.5.15
|
297 |
+
- requests==2.32.3
|
298 |
+
- requests-oauthlib==2.0.0
|
299 |
+
- requests-toolbelt==1.0.0
|
300 |
+
- rich==13.7.1
|
301 |
+
- rouge==1.0.1
|
302 |
+
- rouge-score==0.1.2
|
303 |
+
- rpds-py==0.19.0
|
304 |
+
- rsa==4.9
|
305 |
+
- rtfde==0.1.2
|
306 |
+
- ruff==0.5.4
|
307 |
+
- s3transfer==0.10.3
|
308 |
+
- sacrebleu==1.5.0
|
309 |
+
- safetensors==0.4.3
|
310 |
+
- scikit-learn==1.5.1
|
311 |
+
- scipy==1.14.0
|
312 |
+
- semantic-version==2.10.0
|
313 |
+
- sentence-transformers==3.0.1
|
314 |
+
- sentencepiece==0.2.0
|
315 |
+
- sentry-sdk==2.10.0
|
316 |
+
- setproctitle==1.3.3
|
317 |
+
- shapely==2.0.6
|
318 |
+
- shellingham==1.5.4
|
319 |
+
- simple-websocket==1.1.0
|
320 |
+
- six==1.16.0
|
321 |
+
- smmap==5.0.1
|
322 |
+
- sniffio==1.3.1
|
323 |
+
- soupsieve==2.6
|
324 |
+
- speechrecognition==3.10.0
|
325 |
+
- spinners==0.0.24
|
326 |
+
- sqlalchemy==2.0.32
|
327 |
+
- sqlitedict==2.1.0
|
328 |
+
- sse-starlette==1.6.5
|
329 |
+
- starlette==0.37.2
|
330 |
+
- sympy==1.12
|
331 |
+
- tabledata==1.3.3
|
332 |
+
- tabulate==0.9.0
|
333 |
+
- tcolorpy==0.1.6
|
334 |
+
- tenacity==8.5.0
|
335 |
+
- tensorboard==2.17.0
|
336 |
+
- tensorboard-data-server==0.7.2
|
337 |
+
- termcolor==2.4.0
|
338 |
+
- threadpoolctl==3.5.0
|
339 |
+
- tiktoken==0.7.0
|
340 |
+
- timm==1.0.8
|
341 |
+
- tokenizers==0.19.1
|
342 |
+
- tomlkit==0.12.0
|
343 |
+
- toolz==0.12.1
|
344 |
+
- torch==2.2.2+cu121
|
345 |
+
- torchaudio==2.2.2+cu121
|
346 |
+
- torchvision==0.17.2+cu121
|
347 |
+
- tqdm==4.66.4
|
348 |
+
- tqdm-multiprocess==0.0.11
|
349 |
+
- transformers==4.44.2
|
350 |
+
- typepy==1.3.2
|
351 |
+
- typer==0.12.3
|
352 |
+
- typing-extensions==4.9.0
|
353 |
+
- typing-inspect==0.9.0
|
354 |
+
- tzdata==2024.1
|
355 |
+
- tzlocal==5.2
|
356 |
+
- ujson==5.10.0
|
357 |
+
- unstructured==0.15.9
|
358 |
+
- unstructured-client==0.26.1
|
359 |
+
- uritemplate==4.1.1
|
360 |
+
- urllib3==2.2.2
|
361 |
+
- uvicorn==0.30.6
|
362 |
+
- validators==0.33.0
|
363 |
+
- wandb==0.17.5
|
364 |
+
- watchfiles==0.22.0
|
365 |
+
- wcwidth==0.2.13
|
366 |
+
- webrtcvad==2.0.10
|
367 |
+
- websocket-client==1.8.0
|
368 |
+
- websockets==11.0.3
|
369 |
+
- werkzeug==3.0.3
|
370 |
+
- win-unicode-console==0.5
|
371 |
+
- wrapt==1.16.0
|
372 |
+
- wsproto==1.2.0
|
373 |
+
- xlrd==2.0.1
|
374 |
+
- xlsxwriter==3.2.0
|
375 |
+
- xxhash==3.4.1
|
376 |
+
- yarl==1.15.1
|
377 |
+
- youtube-transcript-api==0.6.2
|
378 |
+
- zipp==3.20.2
|
379 |
+
- zstandard==0.23.0
|
380 |
+
prefix: C:\Users\Administrator\Desktop\text-generation-webui-main\installer_files\env
|
Training_BlackSheep_MoE/Epoch_1/Epoch_1_WandDB/files/wandb-metadata.json
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"os": "Windows-10-10.0.22621-SP0",
|
3 |
+
"python": "3.11.9",
|
4 |
+
"heartbeatAt": "2024-10-17T19:57:13.102016",
|
5 |
+
"startedAt": "2024-10-17T19:57:13.021163",
|
6 |
+
"docker": null,
|
7 |
+
"cuda": null,
|
8 |
+
"args": [],
|
9 |
+
"state": "running",
|
10 |
+
"program": "C:\\Users\\Administrator\\Desktop\\text-generation-webui-main\\server.py",
|
11 |
+
"codePathLocal": "server.py",
|
12 |
+
"codePath": "server.py",
|
13 |
+
"git": {
|
14 |
+
"remote": "https://github.com/oobabooga/text-generation-webui",
|
15 |
+
"commit": "5522584992c632d75d2389e9342793fd1dcc391d"
|
16 |
+
},
|
17 |
+
"email": "[email protected]",
|
18 |
+
"root": "C:/Users/Administrator/Desktop/text-generation-webui-main",
|
19 |
+
"host": "Troy-New-PC",
|
20 |
+
"username": "Troy",
|
21 |
+
"executable": "C:\\Users\\Administrator\\Desktop\\text-generation-webui-main\\installer_files\\env\\python.exe",
|
22 |
+
"cpu_count": 12,
|
23 |
+
"cpu_count_logical": 20,
|
24 |
+
"cpu_freq": {
|
25 |
+
"current": 3600.0,
|
26 |
+
"min": 0.0,
|
27 |
+
"max": 3600.0
|
28 |
+
},
|
29 |
+
"cpu_freq_per_core": [
|
30 |
+
{
|
31 |
+
"current": 3600.0,
|
32 |
+
"min": 0.0,
|
33 |
+
"max": 3600.0
|
34 |
+
}
|
35 |
+
],
|
36 |
+
"disk": {
|
37 |
+
"/": {
|
38 |
+
"total": 930.6826133728027,
|
39 |
+
"used": 716.3763427734375
|
40 |
+
}
|
41 |
+
},
|
42 |
+
"gpu": "NVIDIA GeForce RTX 3090",
|
43 |
+
"gpu_count": 1,
|
44 |
+
"gpu_devices": [
|
45 |
+
{
|
46 |
+
"name": "NVIDIA GeForce RTX 3090",
|
47 |
+
"memory_total": 25769803776
|
48 |
+
}
|
49 |
+
],
|
50 |
+
"memory": {
|
51 |
+
"total": 31.863067626953125
|
52 |
+
}
|
53 |
+
}
|
Training_BlackSheep_MoE/Epoch_1/Epoch_1_WandDB/logs/debug-internal.log
ADDED
The diff for this file is too large to render.
See raw diff
|
|
Training_BlackSheep_MoE/Epoch_1/Epoch_1_WandDB/logs/debug.log
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-10-17 12:57:13,023 INFO Thread-7 (threaded_run):32648 [wandb_setup.py:_flush():76] Current SDK version is 0.17.5
|
2 |
+
2024-10-17 12:57:13,023 INFO Thread-7 (threaded_run):32648 [wandb_setup.py:_flush():76] Configure stats pid to 32648
|
3 |
+
2024-10-17 12:57:13,023 INFO Thread-7 (threaded_run):32648 [wandb_setup.py:_flush():76] Loading settings from C:\Users\Administrator\.config\wandb\settings
|
4 |
+
2024-10-17 12:57:13,023 INFO Thread-7 (threaded_run):32648 [wandb_setup.py:_flush():76] Loading settings from C:\Users\Administrator\Desktop\text-generation-webui-main\wandb\settings
|
5 |
+
2024-10-17 12:57:13,023 INFO Thread-7 (threaded_run):32648 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'mode': 'offline'}
|
6 |
+
2024-10-17 12:57:13,023 INFO Thread-7 (threaded_run):32648 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
|
7 |
+
2024-10-17 12:57:13,023 INFO Thread-7 (threaded_run):32648 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'server.py', 'program_abspath': 'C:\\Users\\Administrator\\Desktop\\text-generation-webui-main\\server.py', 'program': 'C:\\Users\\Administrator\\Desktop\\text-generation-webui-main\\server.py'}
|
8 |
+
2024-10-17 12:57:13,023 INFO Thread-7 (threaded_run):32648 [wandb_init.py:_log_setup():529] Logging user logs to C:\Users\Administrator\Desktop\text-generation-webui-main\wandb\offline-run-20241017_125713-zd7c97g6\logs\debug.log
|
9 |
+
2024-10-17 12:57:13,023 INFO Thread-7 (threaded_run):32648 [wandb_init.py:_log_setup():530] Logging internal logs to C:\Users\Administrator\Desktop\text-generation-webui-main\wandb\offline-run-20241017_125713-zd7c97g6\logs\debug-internal.log
|
10 |
+
2024-10-17 12:57:13,023 INFO Thread-7 (threaded_run):32648 [wandb_init.py:init():569] calling init triggers
|
11 |
+
2024-10-17 12:57:13,023 INFO Thread-7 (threaded_run):32648 [wandb_init.py:init():576] wandb.init called with sweep_config: {}
|
12 |
+
config: {}
|
13 |
+
2024-10-17 12:57:13,023 INFO Thread-7 (threaded_run):32648 [wandb_init.py:init():619] starting backend
|
14 |
+
2024-10-17 12:57:13,023 INFO Thread-7 (threaded_run):32648 [wandb_init.py:init():623] setting up manager
|
15 |
+
2024-10-17 12:57:13,024 INFO Thread-7 (threaded_run):32648 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=spawn, using: spawn
|
16 |
+
2024-10-17 12:57:13,025 INFO Thread-7 (threaded_run):32648 [wandb_init.py:init():631] backend started and connected
|
17 |
+
2024-10-17 12:57:13,028 INFO Thread-7 (threaded_run):32648 [wandb_init.py:init():720] updated telemetry
|
18 |
+
2024-10-17 12:57:13,052 INFO Thread-7 (threaded_run):32648 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
|
19 |
+
2024-10-17 12:57:13,054 INFO Thread-7 (threaded_run):32648 [wandb_init.py:init():804] starting run threads in backend
|
20 |
+
2024-10-17 12:57:16,864 INFO Thread-7 (threaded_run):32648 [wandb_run.py:_console_start():2413] atexit reg
|
21 |
+
2024-10-17 12:57:16,864 INFO Thread-7 (threaded_run):32648 [wandb_run.py:_redirect():2255] redirect: wrap_raw
|
22 |
+
2024-10-17 12:57:16,864 INFO Thread-7 (threaded_run):32648 [wandb_run.py:_redirect():2320] Wrapping output streams.
|
23 |
+
2024-10-17 12:57:16,864 INFO Thread-7 (threaded_run):32648 [wandb_run.py:_redirect():2345] Redirects installed.
|
24 |
+
2024-10-17 12:57:16,865 INFO Thread-7 (threaded_run):32648 [wandb_init.py:init():847] run started, returning control to user process
|
25 |
+
2024-10-17 12:57:16,867 INFO Thread-7 (threaded_run):32648 [wandb_run.py:_config_callback():1382] config_cb None None {'peft_config': {'default': {'peft_type': <PeftType.LORA: 'LORA'>, 'auto_mapping': None, 'base_model_name_or_path': 'models\\TroyDoesAI_ContextObedient-MoE', 'revision': None, 'task_type': 'CAUSAL_LM', 'inference_mode': False, 'r': 32, 'target_modules': {'q_proj', 'v_proj'}, 'lora_alpha': 64, 'lora_dropout': 0, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'loftq_config': {}, 'use_dora': False, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}}}, 'vocab_size': 32064, 'max_position_embeddings': 16384, 'hidden_size': 3072, 'intermediate_size': 8192, 'num_hidden_layers': 32, 'num_attention_heads': 32, 'sliding_window': None, 'num_key_value_heads': 32, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'use_cache': False, 'rope_theta': 10000.0, 'attention_dropout': 0.0, 'num_experts_per_tok': 2, 'num_local_experts': 3, 'output_router_logits': False, 'router_aux_loss_coef': 0.001, 'router_jitter_noise': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['MixtralForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 32000, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'models\\TroyDoesAI_ContextObedient-MoE', 'transformers_version': '4.44.2', 'attention_bias': False, 'mlp_bias': False, 'model_type': 'mixtral', 'pretraining_tp': 1, 'rope_scaling': None, 'quantization_config': {'quant_method': 'BITS_AND_BYTES', '_load_in_8bit': False, '_load_in_4bit': True, 'llm_int8_threshold': 6.0, 'llm_int8_skip_modules': None, 'llm_int8_enable_fp32_cpu_offload': True, 'llm_int8_has_fp16_weight': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'float16', 'bnb_4bit_quant_storage': 'uint8', 'load_in_4bit': True, 'load_in_8bit': False}, 'output_dir': 'loras/BlackSheep-Lora', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 2e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 6, 'max_steps': -1, 'lr_scheduler_type': 'polynomial', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 'loras/BlackSheep-Lora\\runs\\Oct17_12-57-11_Troy-New-PC', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 5, 'logging_nan_inf_filter': True, 'save_strategy': 'no', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 'loras/BlackSheep-Lora', 'disable_tqdm': True, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_bnb_8bit', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'no', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'eval_use_gather_object': False}
|
26 |
+
2024-10-17 12:57:16,870 INFO Thread-7 (threaded_run):32648 [wandb_config.py:__setitem__():151] config set model/num_parameters = 8665795584 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x00000214C0853850>>
|
27 |
+
2024-10-17 12:57:16,870 INFO Thread-7 (threaded_run):32648 [wandb_run.py:_config_callback():1382] config_cb model/num_parameters 8665795584 None
|
28 |
+
2024-10-17 18:00:28,321 WARNING MsgRouterThr:32648 [router.py:message_loop():77] message_loop has been closed
|
Training_BlackSheep_MoE/Epoch_1/Epoch_1_WandDB/run-zd7c97g6.wandb
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:eaa1f304de6ed78411f4c9f14e9eedf8f4193726cc33705b574db125f45a5d3f
|
3 |
+
size 1038295
|
Training_BlackSheep_MoE/Epoch_1/MIXTRAL-training_log_Epoch1-BlackSheep-with-Personas.txt
ADDED
@@ -0,0 +1,288 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Step: 4 {'loss': 1.8393, 'grad_norm': 0.9135997295379639, 'learning_rate': 1.9988154761904764e-05, 'epoch': 0.0035714285714285713}
|
2 |
+
Step: 9 {'loss': 1.6701, 'grad_norm': 0.7541273832321167, 'learning_rate': 1.9976309523809527e-05, 'epoch': 0.007142857142857143}
|
3 |
+
Step: 14 {'loss': 1.2554, 'grad_norm': 0.6005926132202148, 'learning_rate': 1.9964464285714286e-05, 'epoch': 0.010714285714285714}
|
4 |
+
Step: 19 {'loss': 1.4734, 'grad_norm': 0.6956166625022888, 'learning_rate': 1.995261904761905e-05, 'epoch': 0.014285714285714285}
|
5 |
+
Step: 24 {'loss': 1.2668, 'grad_norm': 0.3963625133037567, 'learning_rate': 1.994077380952381e-05, 'epoch': 0.017857142857142856}
|
6 |
+
Step: 29 {'loss': 1.2668, 'grad_norm': 0.41620415449142456, 'learning_rate': 1.9928928571428574e-05, 'epoch': 0.02142857142857143}
|
7 |
+
Step: 34 {'loss': 1.4369, 'grad_norm': 0.7620450854301453, 'learning_rate': 1.9917083333333336e-05, 'epoch': 0.025}
|
8 |
+
Step: 39 {'loss': 1.3254, 'grad_norm': 0.6728664636611938, 'learning_rate': 1.99052380952381e-05, 'epoch': 0.02857142857142857}
|
9 |
+
Step: 44 {'loss': 1.0877, 'grad_norm': 0.5350437760353088, 'learning_rate': 1.9893392857142858e-05, 'epoch': 0.03214285714285714}
|
10 |
+
Step: 49 {'loss': 1.0392, 'grad_norm': 0.6517331600189209, 'learning_rate': 1.988154761904762e-05, 'epoch': 0.03571428571428571}
|
11 |
+
Step: 54 {'loss': 1.0966, 'grad_norm': 0.4985944628715515, 'learning_rate': 1.9869702380952383e-05, 'epoch': 0.039285714285714285}
|
12 |
+
Step: 59 {'loss': 1.0616, 'grad_norm': 0.6425554752349854, 'learning_rate': 1.9857857142857145e-05, 'epoch': 0.04285714285714286}
|
13 |
+
Step: 64 {'loss': 1.3173, 'grad_norm': 0.9711938500404358, 'learning_rate': 1.9846011904761908e-05, 'epoch': 0.04642857142857143}
|
14 |
+
Step: 69 {'loss': 0.965, 'grad_norm': 0.5132351517677307, 'learning_rate': 1.9834166666666667e-05, 'epoch': 0.05}
|
15 |
+
Step: 74 {'loss': 0.8959, 'grad_norm': 0.5035068988800049, 'learning_rate': 1.982232142857143e-05, 'epoch': 0.05357142857142857}
|
16 |
+
Step: 79 {'loss': 0.8977, 'grad_norm': 0.6544917821884155, 'learning_rate': 1.9810476190476192e-05, 'epoch': 0.05714285714285714}
|
17 |
+
Step: 84 {'loss': 0.8336, 'grad_norm': 0.7621152997016907, 'learning_rate': 1.9798630952380955e-05, 'epoch': 0.060714285714285714}
|
18 |
+
Step: 89 {'loss': 0.8617, 'grad_norm': 0.5246341228485107, 'learning_rate': 1.9786785714285717e-05, 'epoch': 0.06428571428571428}
|
19 |
+
Step: 94 {'loss': 0.7953, 'grad_norm': 0.5174001455307007, 'learning_rate': 1.977494047619048e-05, 'epoch': 0.06785714285714285}
|
20 |
+
Step: 99 {'loss': 1.0036, 'grad_norm': 0.8036244511604309, 'learning_rate': 1.976309523809524e-05, 'epoch': 0.07142857142857142}
|
21 |
+
Step: 104 {'loss': 0.9666, 'grad_norm': 0.8807173371315002, 'learning_rate': 1.975125e-05, 'epoch': 0.075}
|
22 |
+
Step: 109 {'loss': 0.7682, 'grad_norm': 0.6022303700447083, 'learning_rate': 1.9739404761904764e-05, 'epoch': 0.07857142857142857}
|
23 |
+
Step: 114 {'loss': 0.8332, 'grad_norm': 0.5524723529815674, 'learning_rate': 1.9727559523809527e-05, 'epoch': 0.08214285714285714}
|
24 |
+
Step: 119 {'loss': 0.6926, 'grad_norm': 0.6818602085113525, 'learning_rate': 1.971571428571429e-05, 'epoch': 0.08571428571428572}
|
25 |
+
Step: 124 {'loss': 0.7233, 'grad_norm': 0.5903816819190979, 'learning_rate': 1.9703869047619052e-05, 'epoch': 0.08928571428571429}
|
26 |
+
Step: 129 {'loss': 0.6642, 'grad_norm': 0.7943810820579529, 'learning_rate': 1.969202380952381e-05, 'epoch': 0.09285714285714286}
|
27 |
+
Step: 134 {'loss': 0.6425, 'grad_norm': 0.5778236985206604, 'learning_rate': 1.9680178571428574e-05, 'epoch': 0.09642857142857143}
|
28 |
+
Step: 139 {'loss': 0.6691, 'grad_norm': 0.5044384002685547, 'learning_rate': 1.9668333333333333e-05, 'epoch': 0.1}
|
29 |
+
Step: 144 {'loss': 0.6386, 'grad_norm': 0.6634503602981567, 'learning_rate': 1.9656488095238095e-05, 'epoch': 0.10357142857142858}
|
30 |
+
Step: 149 {'loss': 0.748, 'grad_norm': 0.653299868106842, 'learning_rate': 1.9644642857142858e-05, 'epoch': 0.10714285714285714}
|
31 |
+
Step: 154 {'loss': 0.6776, 'grad_norm': 0.6532788276672363, 'learning_rate': 1.963279761904762e-05, 'epoch': 0.11071428571428571}
|
32 |
+
Step: 159 {'loss': 0.4942, 'grad_norm': 0.6517465114593506, 'learning_rate': 1.9620952380952383e-05, 'epoch': 0.11428571428571428}
|
33 |
+
Step: 164 {'loss': 0.5849, 'grad_norm': 0.7105296850204468, 'learning_rate': 1.9609107142857142e-05, 'epoch': 0.11785714285714285}
|
34 |
+
Step: 169 {'loss': 0.5823, 'grad_norm': 0.8051909804344177, 'learning_rate': 1.9597261904761905e-05, 'epoch': 0.12142857142857143}
|
35 |
+
Step: 174 {'loss': 0.6124, 'grad_norm': 0.901710569858551, 'learning_rate': 1.9585416666666667e-05, 'epoch': 0.125}
|
36 |
+
Step: 179 {'loss': 0.7032, 'grad_norm': 0.8329763412475586, 'learning_rate': 1.957357142857143e-05, 'epoch': 0.12857142857142856}
|
37 |
+
Step: 184 {'loss': 0.5445, 'grad_norm': 0.41105031967163086, 'learning_rate': 1.9561726190476192e-05, 'epoch': 0.13214285714285715}
|
38 |
+
Step: 189 {'loss': 0.6622, 'grad_norm': 0.9969037175178528, 'learning_rate': 1.9549880952380955e-05, 'epoch': 0.1357142857142857}
|
39 |
+
Step: 194 {'loss': 0.6171, 'grad_norm': 0.6689130067825317, 'learning_rate': 1.9538035714285714e-05, 'epoch': 0.1392857142857143}
|
40 |
+
Step: 199 {'loss': 0.5791, 'grad_norm': 0.9522849917411804, 'learning_rate': 1.9526190476190477e-05, 'epoch': 0.14285714285714285}
|
41 |
+
Step: 204 {'loss': 0.7005, 'grad_norm': 0.6476870179176331, 'learning_rate': 1.951434523809524e-05, 'epoch': 0.14642857142857144}
|
42 |
+
Step: 209 {'loss': 0.548, 'grad_norm': 0.6938184499740601, 'learning_rate': 1.9502500000000002e-05, 'epoch': 0.15}
|
43 |
+
Step: 214 {'loss': 0.5984, 'grad_norm': 0.7758791446685791, 'learning_rate': 1.9490654761904764e-05, 'epoch': 0.15357142857142858}
|
44 |
+
Step: 219 {'loss': 0.5668, 'grad_norm': 0.6298092007637024, 'learning_rate': 1.9478809523809523e-05, 'epoch': 0.15714285714285714}
|
45 |
+
Step: 224 {'loss': 0.4565, 'grad_norm': 0.8767203688621521, 'learning_rate': 1.9466964285714286e-05, 'epoch': 0.16071428571428573}
|
46 |
+
Step: 229 {'loss': 0.4918, 'grad_norm': 0.8556920886039734, 'learning_rate': 1.945511904761905e-05, 'epoch': 0.16428571428571428}
|
47 |
+
Step: 234 {'loss': 0.6559, 'grad_norm': 0.9740686416625977, 'learning_rate': 1.944327380952381e-05, 'epoch': 0.16785714285714284}
|
48 |
+
Step: 239 {'loss': 0.5291, 'grad_norm': 1.2440892457962036, 'learning_rate': 1.9431428571428574e-05, 'epoch': 0.17142857142857143}
|
49 |
+
Step: 244 {'loss': 0.4741, 'grad_norm': 0.8751192092895508, 'learning_rate': 1.9419583333333336e-05, 'epoch': 0.175}
|
50 |
+
Step: 249 {'loss': 0.4616, 'grad_norm': 0.6261155009269714, 'learning_rate': 1.9407738095238095e-05, 'epoch': 0.17857142857142858}
|
51 |
+
Step: 254 {'loss': 0.6365, 'grad_norm': 0.7985801696777344, 'learning_rate': 1.9395892857142858e-05, 'epoch': 0.18214285714285713}
|
52 |
+
Step: 259 {'loss': 0.4893, 'grad_norm': 0.8446434140205383, 'learning_rate': 1.938404761904762e-05, 'epoch': 0.18571428571428572}
|
53 |
+
Step: 264 {'loss': 0.4726, 'grad_norm': 0.998410701751709, 'learning_rate': 1.9372202380952383e-05, 'epoch': 0.18928571428571428}
|
54 |
+
Step: 269 {'loss': 0.4389, 'grad_norm': 0.7421302199363708, 'learning_rate': 1.9360357142857146e-05, 'epoch': 0.19285714285714287}
|
55 |
+
Step: 274 {'loss': 0.3953, 'grad_norm': 0.7805348038673401, 'learning_rate': 1.9348511904761905e-05, 'epoch': 0.19642857142857142}
|
56 |
+
Step: 279 {'loss': 0.5141, 'grad_norm': 1.2035925388336182, 'learning_rate': 1.9336666666666667e-05, 'epoch': 0.2}
|
57 |
+
Step: 284 {'loss': 0.5336, 'grad_norm': 0.8996376395225525, 'learning_rate': 1.932482142857143e-05, 'epoch': 0.20357142857142857}
|
58 |
+
Step: 289 {'loss': 0.5258, 'grad_norm': 0.6805949211120605, 'learning_rate': 1.9312976190476192e-05, 'epoch': 0.20714285714285716}
|
59 |
+
Step: 294 {'loss': 0.4903, 'grad_norm': 0.7106399536132812, 'learning_rate': 1.9301130952380955e-05, 'epoch': 0.21071428571428572}
|
60 |
+
Step: 299 {'loss': 0.3474, 'grad_norm': 0.5261926054954529, 'learning_rate': 1.9289285714285718e-05, 'epoch': 0.21428571428571427}
|
61 |
+
Step: 304 {'loss': 0.5147, 'grad_norm': 0.8087174892425537, 'learning_rate': 1.9277440476190477e-05, 'epoch': 0.21785714285714286}
|
62 |
+
Step: 309 {'loss': 0.387, 'grad_norm': 0.6345266699790955, 'learning_rate': 1.926559523809524e-05, 'epoch': 0.22142857142857142}
|
63 |
+
Step: 314 {'loss': 0.5001, 'grad_norm': 0.6739416122436523, 'learning_rate': 1.9253750000000002e-05, 'epoch': 0.225}
|
64 |
+
Step: 319 {'loss': 0.4823, 'grad_norm': 0.9729430079460144, 'learning_rate': 1.9241904761904764e-05, 'epoch': 0.22857142857142856}
|
65 |
+
Step: 324 {'loss': 0.4074, 'grad_norm': 0.8870615363121033, 'learning_rate': 1.9230059523809527e-05, 'epoch': 0.23214285714285715}
|
66 |
+
Step: 329 {'loss': 0.404, 'grad_norm': 0.6122156381607056, 'learning_rate': 1.921821428571429e-05, 'epoch': 0.2357142857142857}
|
67 |
+
Step: 334 {'loss': 0.399, 'grad_norm': 0.7009484767913818, 'learning_rate': 1.920636904761905e-05, 'epoch': 0.2392857142857143}
|
68 |
+
Step: 339 {'loss': 0.4015, 'grad_norm': 0.9186666011810303, 'learning_rate': 1.919452380952381e-05, 'epoch': 0.24285714285714285}
|
69 |
+
Step: 344 {'loss': 0.3585, 'grad_norm': 0.7455053925514221, 'learning_rate': 1.9182678571428574e-05, 'epoch': 0.24642857142857144}
|
70 |
+
Step: 349 {'loss': 0.3825, 'grad_norm': 0.6361353993415833, 'learning_rate': 1.9170833333333336e-05, 'epoch': 0.25}
|
71 |
+
Step: 354 {'loss': 0.4294, 'grad_norm': 0.8647685050964355, 'learning_rate': 1.91589880952381e-05, 'epoch': 0.25357142857142856}
|
72 |
+
Step: 359 {'loss': 0.3671, 'grad_norm': 0.5729554295539856, 'learning_rate': 1.9147142857142858e-05, 'epoch': 0.2571428571428571}
|
73 |
+
Step: 364 {'loss': 0.3255, 'grad_norm': 0.5804448127746582, 'learning_rate': 1.913529761904762e-05, 'epoch': 0.26071428571428573}
|
74 |
+
Step: 369 {'loss': 0.513, 'grad_norm': 0.8690835237503052, 'learning_rate': 1.9123452380952383e-05, 'epoch': 0.2642857142857143}
|
75 |
+
Step: 374 {'loss': 0.3854, 'grad_norm': 1.04226553440094, 'learning_rate': 1.9111607142857146e-05, 'epoch': 0.26785714285714285}
|
76 |
+
Step: 379 {'loss': 0.3508, 'grad_norm': 0.9402156472206116, 'learning_rate': 1.9099761904761908e-05, 'epoch': 0.2714285714285714}
|
77 |
+
Step: 384 {'loss': 0.4982, 'grad_norm': 0.6728529930114746, 'learning_rate': 1.908791666666667e-05, 'epoch': 0.275}
|
78 |
+
Step: 389 {'loss': 0.3801, 'grad_norm': 0.6414267420768738, 'learning_rate': 1.907607142857143e-05, 'epoch': 0.2785714285714286}
|
79 |
+
Step: 394 {'loss': 0.4152, 'grad_norm': 1.0602443218231201, 'learning_rate': 1.9064226190476192e-05, 'epoch': 0.28214285714285714}
|
80 |
+
Step: 399 {'loss': 0.4706, 'grad_norm': 0.5564787983894348, 'learning_rate': 1.905238095238095e-05, 'epoch': 0.2857142857142857}
|
81 |
+
Step: 404 {'loss': 0.3742, 'grad_norm': 0.8708383440971375, 'learning_rate': 1.9040535714285714e-05, 'epoch': 0.2892857142857143}
|
82 |
+
Step: 409 {'loss': 0.4291, 'grad_norm': 1.4418401718139648, 'learning_rate': 1.9028690476190477e-05, 'epoch': 0.29285714285714287}
|
83 |
+
Step: 414 {'loss': 0.4047, 'grad_norm': 0.877226710319519, 'learning_rate': 1.901684523809524e-05, 'epoch': 0.29642857142857143}
|
84 |
+
Step: 419 {'loss': 0.5602, 'grad_norm': 0.9882636666297913, 'learning_rate': 1.9005000000000002e-05, 'epoch': 0.3}
|
85 |
+
Step: 424 {'loss': 0.4152, 'grad_norm': 0.6507172584533691, 'learning_rate': 1.899315476190476e-05, 'epoch': 0.30357142857142855}
|
86 |
+
Step: 429 {'loss': 0.385, 'grad_norm': 0.8154886960983276, 'learning_rate': 1.8981309523809524e-05, 'epoch': 0.30714285714285716}
|
87 |
+
Step: 434 {'loss': 0.4448, 'grad_norm': 1.0586435794830322, 'learning_rate': 1.8969464285714286e-05, 'epoch': 0.3107142857142857}
|
88 |
+
Step: 439 {'loss': 0.4122, 'grad_norm': 0.9430441856384277, 'learning_rate': 1.895761904761905e-05, 'epoch': 0.3142857142857143}
|
89 |
+
Step: 444 {'loss': 0.3908, 'grad_norm': 0.5545459985733032, 'learning_rate': 1.894577380952381e-05, 'epoch': 0.31785714285714284}
|
90 |
+
Step: 449 {'loss': 0.4621, 'grad_norm': 0.9478644132614136, 'learning_rate': 1.8933928571428574e-05, 'epoch': 0.32142857142857145}
|
91 |
+
Step: 454 {'loss': 0.3571, 'grad_norm': 0.5178350210189819, 'learning_rate': 1.8922083333333333e-05, 'epoch': 0.325}
|
92 |
+
Step: 459 {'loss': 0.283, 'grad_norm': 0.544847846031189, 'learning_rate': 1.8910238095238095e-05, 'epoch': 0.32857142857142857}
|
93 |
+
Step: 464 {'loss': 0.3541, 'grad_norm': 0.736697256565094, 'learning_rate': 1.8898392857142858e-05, 'epoch': 0.33214285714285713}
|
94 |
+
Step: 469 {'loss': 0.4792, 'grad_norm': 1.094739556312561, 'learning_rate': 1.888654761904762e-05, 'epoch': 0.3357142857142857}
|
95 |
+
Step: 474 {'loss': 0.4067, 'grad_norm': 0.8329822421073914, 'learning_rate': 1.8874702380952383e-05, 'epoch': 0.3392857142857143}
|
96 |
+
Step: 479 {'loss': 0.4462, 'grad_norm': 0.7292098999023438, 'learning_rate': 1.8862857142857142e-05, 'epoch': 0.34285714285714286}
|
97 |
+
Step: 484 {'loss': 0.4539, 'grad_norm': 0.8009581565856934, 'learning_rate': 1.8851011904761905e-05, 'epoch': 0.3464285714285714}
|
98 |
+
Step: 489 {'loss': 0.4553, 'grad_norm': 1.1885050535202026, 'learning_rate': 1.8839166666666667e-05, 'epoch': 0.35}
|
99 |
+
Step: 494 {'loss': 0.2783, 'grad_norm': 0.5375037789344788, 'learning_rate': 1.882732142857143e-05, 'epoch': 0.3535714285714286}
|
100 |
+
Step: 499 {'loss': 0.3999, 'grad_norm': 0.8919097185134888, 'learning_rate': 1.8815476190476193e-05, 'epoch': 0.35714285714285715}
|
101 |
+
Step: 504 {'loss': 0.3989, 'grad_norm': 0.8732523918151855, 'learning_rate': 1.8803630952380955e-05, 'epoch': 0.3607142857142857}
|
102 |
+
Step: 509 {'loss': 0.4154, 'grad_norm': 0.5995797514915466, 'learning_rate': 1.8791785714285714e-05, 'epoch': 0.36428571428571427}
|
103 |
+
Step: 514 {'loss': 0.3292, 'grad_norm': 0.6189977526664734, 'learning_rate': 1.8779940476190477e-05, 'epoch': 0.3678571428571429}
|
104 |
+
Step: 519 {'loss': 0.405, 'grad_norm': 0.8150209188461304, 'learning_rate': 1.876809523809524e-05, 'epoch': 0.37142857142857144}
|
105 |
+
Step: 524 {'loss': 0.3163, 'grad_norm': 0.9416306018829346, 'learning_rate': 1.8756250000000002e-05, 'epoch': 0.375}
|
106 |
+
Step: 529 {'loss': 0.4091, 'grad_norm': 1.3431493043899536, 'learning_rate': 1.8744404761904764e-05, 'epoch': 0.37857142857142856}
|
107 |
+
Step: 534 {'loss': 0.3866, 'grad_norm': 0.6864269375801086, 'learning_rate': 1.8732559523809527e-05, 'epoch': 0.3821428571428571}
|
108 |
+
Step: 539 {'loss': 0.4974, 'grad_norm': 1.5816361904144287, 'learning_rate': 1.8720714285714286e-05, 'epoch': 0.38571428571428573}
|
109 |
+
Step: 544 {'loss': 0.3429, 'grad_norm': 0.9576981663703918, 'learning_rate': 1.870886904761905e-05, 'epoch': 0.3892857142857143}
|
110 |
+
Step: 549 {'loss': 0.5506, 'grad_norm': 0.8996490836143494, 'learning_rate': 1.869702380952381e-05, 'epoch': 0.39285714285714285}
|
111 |
+
Step: 554 {'loss': 0.4465, 'grad_norm': 0.6192946434020996, 'learning_rate': 1.8685178571428574e-05, 'epoch': 0.3964285714285714}
|
112 |
+
Step: 559 {'loss': 0.5349, 'grad_norm': 0.8320725560188293, 'learning_rate': 1.8673333333333336e-05, 'epoch': 0.4}
|
113 |
+
Step: 564 {'loss': 0.4332, 'grad_norm': 0.984227180480957, 'learning_rate': 1.8661488095238096e-05, 'epoch': 0.4035714285714286}
|
114 |
+
Step: 569 {'loss': 0.3875, 'grad_norm': 0.9194024205207825, 'learning_rate': 1.8649642857142858e-05, 'epoch': 0.40714285714285714}
|
115 |
+
Step: 574 {'loss': 0.3601, 'grad_norm': 0.7953531742095947, 'learning_rate': 1.863779761904762e-05, 'epoch': 0.4107142857142857}
|
116 |
+
Step: 579 {'loss': 0.3632, 'grad_norm': 0.7107942700386047, 'learning_rate': 1.8625952380952383e-05, 'epoch': 0.4142857142857143}
|
117 |
+
Step: 584 {'loss': 0.3376, 'grad_norm': 1.1161280870437622, 'learning_rate': 1.8614107142857146e-05, 'epoch': 0.41785714285714287}
|
118 |
+
Step: 589 {'loss': 0.3145, 'grad_norm': 0.7818060517311096, 'learning_rate': 1.8602261904761908e-05, 'epoch': 0.42142857142857143}
|
119 |
+
Step: 594 {'loss': 0.3659, 'grad_norm': 0.9914860129356384, 'learning_rate': 1.8590416666666667e-05, 'epoch': 0.425}
|
120 |
+
Step: 599 {'loss': 0.4318, 'grad_norm': 1.2383373975753784, 'learning_rate': 1.857857142857143e-05, 'epoch': 0.42857142857142855}
|
121 |
+
Step: 604 {'loss': 0.3811, 'grad_norm': 1.2427709102630615, 'learning_rate': 1.8566726190476193e-05, 'epoch': 0.43214285714285716}
|
122 |
+
Step: 609 {'loss': 0.4007, 'grad_norm': 0.795452356338501, 'learning_rate': 1.8554880952380955e-05, 'epoch': 0.4357142857142857}
|
123 |
+
Step: 614 {'loss': 0.4641, 'grad_norm': 0.7359730005264282, 'learning_rate': 1.8543035714285718e-05, 'epoch': 0.4392857142857143}
|
124 |
+
Step: 619 {'loss': 0.3813, 'grad_norm': 1.0183659791946411, 'learning_rate': 1.8531190476190477e-05, 'epoch': 0.44285714285714284}
|
125 |
+
Step: 624 {'loss': 0.3183, 'grad_norm': 0.6068124175071716, 'learning_rate': 1.851934523809524e-05, 'epoch': 0.44642857142857145}
|
126 |
+
Step: 629 {'loss': 0.3476, 'grad_norm': 0.8716106414794922, 'learning_rate': 1.8507500000000002e-05, 'epoch': 0.45}
|
127 |
+
Step: 634 {'loss': 0.3398, 'grad_norm': 0.8179718852043152, 'learning_rate': 1.8495654761904765e-05, 'epoch': 0.45357142857142857}
|
128 |
+
Step: 639 {'loss': 0.3957, 'grad_norm': 0.8983686566352844, 'learning_rate': 1.8483809523809527e-05, 'epoch': 0.45714285714285713}
|
129 |
+
Step: 644 {'loss': 0.4374, 'grad_norm': 1.2546746730804443, 'learning_rate': 1.847196428571429e-05, 'epoch': 0.4607142857142857}
|
130 |
+
Step: 649 {'loss': 0.4142, 'grad_norm': 0.9628292322158813, 'learning_rate': 1.846011904761905e-05, 'epoch': 0.4642857142857143}
|
131 |
+
Step: 654 {'loss': 0.371, 'grad_norm': 1.2992581129074097, 'learning_rate': 1.844827380952381e-05, 'epoch': 0.46785714285714286}
|
132 |
+
Step: 659 {'loss': 0.3134, 'grad_norm': 0.9638750553131104, 'learning_rate': 1.8436428571428574e-05, 'epoch': 0.4714285714285714}
|
133 |
+
Step: 664 {'loss': 0.2918, 'grad_norm': 0.7682401537895203, 'learning_rate': 1.8424583333333336e-05, 'epoch': 0.475}
|
134 |
+
Step: 669 {'loss': 0.3406, 'grad_norm': 0.7767547965049744, 'learning_rate': 1.84127380952381e-05, 'epoch': 0.4785714285714286}
|
135 |
+
Step: 674 {'loss': 0.4479, 'grad_norm': 1.2478020191192627, 'learning_rate': 1.8400892857142858e-05, 'epoch': 0.48214285714285715}
|
136 |
+
Step: 679 {'loss': 0.2975, 'grad_norm': 1.1172363758087158, 'learning_rate': 1.838904761904762e-05, 'epoch': 0.4857142857142857}
|
137 |
+
Step: 684 {'loss': 0.3755, 'grad_norm': 0.6458436846733093, 'learning_rate': 1.837720238095238e-05, 'epoch': 0.48928571428571427}
|
138 |
+
Step: 689 {'loss': 0.4156, 'grad_norm': 0.7070118188858032, 'learning_rate': 1.8365357142857142e-05, 'epoch': 0.4928571428571429}
|
139 |
+
Step: 694 {'loss': 0.3455, 'grad_norm': 0.8581281900405884, 'learning_rate': 1.8353511904761905e-05, 'epoch': 0.49642857142857144}
|
140 |
+
Step: 699 {'loss': 0.3322, 'grad_norm': 0.7677241563796997, 'learning_rate': 1.8341666666666668e-05, 'epoch': 0.5}
|
141 |
+
Step: 704 {'loss': 0.3077, 'grad_norm': 0.6088917851448059, 'learning_rate': 1.832982142857143e-05, 'epoch': 0.5035714285714286}
|
142 |
+
Step: 709 {'loss': 0.332, 'grad_norm': 0.7178571224212646, 'learning_rate': 1.8317976190476193e-05, 'epoch': 0.5071428571428571}
|
143 |
+
Step: 714 {'loss': 0.3245, 'grad_norm': 0.7131450772285461, 'learning_rate': 1.8306130952380952e-05, 'epoch': 0.5107142857142857}
|
144 |
+
Step: 719 {'loss': 0.3513, 'grad_norm': 1.0770076513290405, 'learning_rate': 1.8294285714285714e-05, 'epoch': 0.5142857142857142}
|
145 |
+
Step: 724 {'loss': 0.2694, 'grad_norm': 0.798261821269989, 'learning_rate': 1.8282440476190477e-05, 'epoch': 0.5178571428571429}
|
146 |
+
Step: 729 {'loss': 0.2473, 'grad_norm': 0.7949568033218384, 'learning_rate': 1.827059523809524e-05, 'epoch': 0.5214285714285715}
|
147 |
+
Step: 734 {'loss': 0.4606, 'grad_norm': 1.0194681882858276, 'learning_rate': 1.8258750000000002e-05, 'epoch': 0.525}
|
148 |
+
Step: 739 {'loss': 0.3139, 'grad_norm': 0.6697856187820435, 'learning_rate': 1.8246904761904765e-05, 'epoch': 0.5285714285714286}
|
149 |
+
Step: 744 {'loss': 0.3363, 'grad_norm': 1.0922176837921143, 'learning_rate': 1.8235059523809524e-05, 'epoch': 0.5321428571428571}
|
150 |
+
Step: 749 {'loss': 0.3805, 'grad_norm': 0.5025736689567566, 'learning_rate': 1.8223214285714286e-05, 'epoch': 0.5357142857142857}
|
151 |
+
Step: 754 {'loss': 0.4875, 'grad_norm': 0.9163244366645813, 'learning_rate': 1.821136904761905e-05, 'epoch': 0.5392857142857143}
|
152 |
+
Step: 759 {'loss': 0.4214, 'grad_norm': 1.0958220958709717, 'learning_rate': 1.819952380952381e-05, 'epoch': 0.5428571428571428}
|
153 |
+
Step: 764 {'loss': 0.291, 'grad_norm': 0.7918387651443481, 'learning_rate': 1.8187678571428574e-05, 'epoch': 0.5464285714285714}
|
154 |
+
Step: 769 {'loss': 0.3236, 'grad_norm': 1.247233271598816, 'learning_rate': 1.8175833333333333e-05, 'epoch': 0.55}
|
155 |
+
Step: 774 {'loss': 0.2934, 'grad_norm': 0.5760002732276917, 'learning_rate': 1.8163988095238096e-05, 'epoch': 0.5535714285714286}
|
156 |
+
Step: 779 {'loss': 0.3677, 'grad_norm': 0.63747239112854, 'learning_rate': 1.8152142857142858e-05, 'epoch': 0.5571428571428572}
|
157 |
+
Step: 784 {'loss': 0.4133, 'grad_norm': 0.7723174691200256, 'learning_rate': 1.814029761904762e-05, 'epoch': 0.5607142857142857}
|
158 |
+
Step: 789 {'loss': 0.3961, 'grad_norm': 0.7719770669937134, 'learning_rate': 1.8128452380952383e-05, 'epoch': 0.5642857142857143}
|
159 |
+
Step: 794 {'loss': 0.3038, 'grad_norm': 0.5547974705696106, 'learning_rate': 1.8116607142857146e-05, 'epoch': 0.5678571428571428}
|
160 |
+
Step: 799 {'loss': 0.3156, 'grad_norm': 0.7582687735557556, 'learning_rate': 1.8104761904761905e-05, 'epoch': 0.5714285714285714}
|
161 |
+
Step: 804 {'loss': 0.2789, 'grad_norm': 0.7563489675521851, 'learning_rate': 1.8092916666666668e-05, 'epoch': 0.575}
|
162 |
+
Step: 809 {'loss': 0.4258, 'grad_norm': 0.6750617027282715, 'learning_rate': 1.808107142857143e-05, 'epoch': 0.5785714285714286}
|
163 |
+
Step: 814 {'loss': 0.3774, 'grad_norm': 0.9238749146461487, 'learning_rate': 1.8069226190476193e-05, 'epoch': 0.5821428571428572}
|
164 |
+
Step: 819 {'loss': 0.3267, 'grad_norm': 0.9808230400085449, 'learning_rate': 1.8057380952380955e-05, 'epoch': 0.5857142857142857}
|
165 |
+
Step: 824 {'loss': 0.3943, 'grad_norm': 0.673556923866272, 'learning_rate': 1.8045535714285714e-05, 'epoch': 0.5892857142857143}
|
166 |
+
Step: 829 {'loss': 0.4032, 'grad_norm': 0.9122436046600342, 'learning_rate': 1.8033690476190477e-05, 'epoch': 0.5928571428571429}
|
167 |
+
Step: 834 {'loss': 0.4152, 'grad_norm': 1.0862387418746948, 'learning_rate': 1.802184523809524e-05, 'epoch': 0.5964285714285714}
|
168 |
+
Step: 839 {'loss': 0.3155, 'grad_norm': 0.4949910640716553, 'learning_rate': 1.8010000000000002e-05, 'epoch': 0.6}
|
169 |
+
Step: 844 {'loss': 0.3416, 'grad_norm': 1.0176829099655151, 'learning_rate': 1.7998154761904765e-05, 'epoch': 0.6035714285714285}
|
170 |
+
Step: 849 {'loss': 0.421, 'grad_norm': 0.964309811592102, 'learning_rate': 1.7986309523809527e-05, 'epoch': 0.6071428571428571}
|
171 |
+
Step: 854 {'loss': 0.3827, 'grad_norm': 0.7528172731399536, 'learning_rate': 1.7974464285714286e-05, 'epoch': 0.6107142857142858}
|
172 |
+
Step: 859 {'loss': 0.434, 'grad_norm': 0.6447746753692627, 'learning_rate': 1.796261904761905e-05, 'epoch': 0.6142857142857143}
|
173 |
+
Step: 864 {'loss': 0.294, 'grad_norm': 0.7310487627983093, 'learning_rate': 1.795077380952381e-05, 'epoch': 0.6178571428571429}
|
174 |
+
Step: 869 {'loss': 0.4067, 'grad_norm': 0.8082703948020935, 'learning_rate': 1.7938928571428574e-05, 'epoch': 0.6214285714285714}
|
175 |
+
Step: 874 {'loss': 0.4972, 'grad_norm': 1.139521837234497, 'learning_rate': 1.7927083333333337e-05, 'epoch': 0.625}
|
176 |
+
Step: 879 {'loss': 0.4101, 'grad_norm': 0.8615649938583374, 'learning_rate': 1.7915238095238096e-05, 'epoch': 0.6285714285714286}
|
177 |
+
Step: 884 {'loss': 0.3884, 'grad_norm': 1.295130968093872, 'learning_rate': 1.7903392857142858e-05, 'epoch': 0.6321428571428571}
|
178 |
+
Step: 889 {'loss': 0.3556, 'grad_norm': 0.6960245370864868, 'learning_rate': 1.789154761904762e-05, 'epoch': 0.6357142857142857}
|
179 |
+
Step: 894 {'loss': 0.3243, 'grad_norm': 0.7199245691299438, 'learning_rate': 1.7879702380952383e-05, 'epoch': 0.6392857142857142}
|
180 |
+
Step: 899 {'loss': 0.5436, 'grad_norm': 0.8263904452323914, 'learning_rate': 1.7867857142857146e-05, 'epoch': 0.6428571428571429}
|
181 |
+
Step: 904 {'loss': 0.3568, 'grad_norm': 0.7996845245361328, 'learning_rate': 1.7856011904761905e-05, 'epoch': 0.6464285714285715}
|
182 |
+
Step: 909 {'loss': 0.2545, 'grad_norm': 0.8239178657531738, 'learning_rate': 1.7844166666666668e-05, 'epoch': 0.65}
|
183 |
+
Step: 914 {'loss': 0.3934, 'grad_norm': 0.8466132283210754, 'learning_rate': 1.783232142857143e-05, 'epoch': 0.6535714285714286}
|
184 |
+
Step: 919 {'loss': 0.2826, 'grad_norm': 0.6736989617347717, 'learning_rate': 1.782047619047619e-05, 'epoch': 0.6571428571428571}
|
185 |
+
Step: 924 {'loss': 0.476, 'grad_norm': 1.1140491962432861, 'learning_rate': 1.7808630952380952e-05, 'epoch': 0.6607142857142857}
|
186 |
+
Step: 929 {'loss': 0.3631, 'grad_norm': 0.7517083883285522, 'learning_rate': 1.7796785714285714e-05, 'epoch': 0.6642857142857143}
|
187 |
+
Step: 934 {'loss': 0.3114, 'grad_norm': 0.8430672287940979, 'learning_rate': 1.7784940476190477e-05, 'epoch': 0.6678571428571428}
|
188 |
+
Step: 939 {'loss': 0.3025, 'grad_norm': 0.5135239362716675, 'learning_rate': 1.777309523809524e-05, 'epoch': 0.6714285714285714}
|
189 |
+
Step: 944 {'loss': 0.3271, 'grad_norm': 0.918813169002533, 'learning_rate': 1.7761250000000002e-05, 'epoch': 0.675}
|
190 |
+
Step: 949 {'loss': 0.388, 'grad_norm': 0.9189344644546509, 'learning_rate': 1.774940476190476e-05, 'epoch': 0.6785714285714286}
|
191 |
+
Step: 954 {'loss': 0.4937, 'grad_norm': 1.078315258026123, 'learning_rate': 1.7737559523809524e-05, 'epoch': 0.6821428571428572}
|
192 |
+
Step: 959 {'loss': 0.3456, 'grad_norm': 1.0054786205291748, 'learning_rate': 1.7725714285714286e-05, 'epoch': 0.6857142857142857}
|
193 |
+
Step: 964 {'loss': 0.3445, 'grad_norm': 0.8630911707878113, 'learning_rate': 1.771386904761905e-05, 'epoch': 0.6892857142857143}
|
194 |
+
Step: 969 {'loss': 0.3572, 'grad_norm': 0.8356649279594421, 'learning_rate': 1.770202380952381e-05, 'epoch': 0.6928571428571428}
|
195 |
+
Step: 974 {'loss': 0.5159, 'grad_norm': 0.9866499900817871, 'learning_rate': 1.769017857142857e-05, 'epoch': 0.6964285714285714}
|
196 |
+
Step: 979 {'loss': 0.3134, 'grad_norm': 1.7429964542388916, 'learning_rate': 1.7678333333333333e-05, 'epoch': 0.7}
|
197 |
+
Step: 984 {'loss': 0.39, 'grad_norm': 1.1718984842300415, 'learning_rate': 1.7666488095238096e-05, 'epoch': 0.7035714285714286}
|
198 |
+
Step: 989 {'loss': 0.3215, 'grad_norm': 0.6568397283554077, 'learning_rate': 1.7654642857142858e-05, 'epoch': 0.7071428571428572}
|
199 |
+
Step: 994 {'loss': 0.329, 'grad_norm': 0.9722153544425964, 'learning_rate': 1.764279761904762e-05, 'epoch': 0.7107142857142857}
|
200 |
+
Step: 999 {'loss': 0.363, 'grad_norm': 0.7853933572769165, 'learning_rate': 1.7630952380952383e-05, 'epoch': 0.7142857142857143}
|
201 |
+
Step: 1004 {'loss': 0.2597, 'grad_norm': 0.7332313060760498, 'learning_rate': 1.7619107142857143e-05, 'epoch': 0.7178571428571429}
|
202 |
+
Step: 1009 {'loss': 0.2838, 'grad_norm': 0.690869927406311, 'learning_rate': 1.7607261904761905e-05, 'epoch': 0.7214285714285714}
|
203 |
+
Step: 1014 {'loss': 0.2714, 'grad_norm': 0.7609320282936096, 'learning_rate': 1.7595416666666668e-05, 'epoch': 0.725}
|
204 |
+
Step: 1019 {'loss': 0.2741, 'grad_norm': 0.7796645760536194, 'learning_rate': 1.758357142857143e-05, 'epoch': 0.7285714285714285}
|
205 |
+
Step: 1024 {'loss': 0.2725, 'grad_norm': 0.7715787887573242, 'learning_rate': 1.7571726190476193e-05, 'epoch': 0.7321428571428571}
|
206 |
+
Step: 1029 {'loss': 0.2917, 'grad_norm': 0.9862931370735168, 'learning_rate': 1.7559880952380952e-05, 'epoch': 0.7357142857142858}
|
207 |
+
Step: 1034 {'loss': 0.3189, 'grad_norm': 0.998388409614563, 'learning_rate': 1.7548035714285715e-05, 'epoch': 0.7392857142857143}
|
208 |
+
Step: 1039 {'loss': 0.3415, 'grad_norm': 1.0296244621276855, 'learning_rate': 1.7536190476190477e-05, 'epoch': 0.7428571428571429}
|
209 |
+
Step: 1044 {'loss': 0.3719, 'grad_norm': 0.6839652061462402, 'learning_rate': 1.752434523809524e-05, 'epoch': 0.7464285714285714}
|
210 |
+
Step: 1049 {'loss': 0.3024, 'grad_norm': 1.1470963954925537, 'learning_rate': 1.7512500000000002e-05, 'epoch': 0.75}
|
211 |
+
Step: 1054 {'loss': 0.408, 'grad_norm': 1.2644574642181396, 'learning_rate': 1.7500654761904765e-05, 'epoch': 0.7535714285714286}
|
212 |
+
Step: 1059 {'loss': 0.4078, 'grad_norm': 0.8794793486595154, 'learning_rate': 1.7488809523809524e-05, 'epoch': 0.7571428571428571}
|
213 |
+
Step: 1064 {'loss': 0.3511, 'grad_norm': 0.9097239971160889, 'learning_rate': 1.7476964285714286e-05, 'epoch': 0.7607142857142857}
|
214 |
+
Step: 1069 {'loss': 0.3949, 'grad_norm': 0.6323068737983704, 'learning_rate': 1.746511904761905e-05, 'epoch': 0.7642857142857142}
|
215 |
+
Step: 1074 {'loss': 0.3794, 'grad_norm': 0.6272197365760803, 'learning_rate': 1.745327380952381e-05, 'epoch': 0.7678571428571429}
|
216 |
+
Step: 1079 {'loss': 0.3388, 'grad_norm': 0.8878163695335388, 'learning_rate': 1.7441428571428574e-05, 'epoch': 0.7714285714285715}
|
217 |
+
Step: 1084 {'loss': 0.4305, 'grad_norm': 1.036283016204834, 'learning_rate': 1.7429583333333333e-05, 'epoch': 0.775}
|
218 |
+
Step: 1089 {'loss': 0.3446, 'grad_norm': 1.0395066738128662, 'learning_rate': 1.7417738095238096e-05, 'epoch': 0.7785714285714286}
|
219 |
+
Step: 1094 {'loss': 0.3496, 'grad_norm': 0.6800899505615234, 'learning_rate': 1.740589285714286e-05, 'epoch': 0.7821428571428571}
|
220 |
+
Step: 1099 {'loss': 0.3291, 'grad_norm': 1.1891331672668457, 'learning_rate': 1.739404761904762e-05, 'epoch': 0.7857142857142857}
|
221 |
+
Step: 1104 {'loss': 0.3476, 'grad_norm': 0.8779275417327881, 'learning_rate': 1.7382202380952383e-05, 'epoch': 0.7892857142857143}
|
222 |
+
Step: 1109 {'loss': 0.3047, 'grad_norm': 0.853313684463501, 'learning_rate': 1.7370357142857146e-05, 'epoch': 0.7928571428571428}
|
223 |
+
Step: 1114 {'loss': 0.3375, 'grad_norm': 0.96327143907547, 'learning_rate': 1.7358511904761905e-05, 'epoch': 0.7964285714285714}
|
224 |
+
Step: 1119 {'loss': 0.3974, 'grad_norm': 1.0462590456008911, 'learning_rate': 1.7346666666666668e-05, 'epoch': 0.8}
|
225 |
+
Step: 1124 {'loss': 0.332, 'grad_norm': 0.6860864758491516, 'learning_rate': 1.733482142857143e-05, 'epoch': 0.8035714285714286}
|
226 |
+
Step: 1129 {'loss': 0.5425, 'grad_norm': 1.0949068069458008, 'learning_rate': 1.7322976190476193e-05, 'epoch': 0.8071428571428572}
|
227 |
+
Step: 1134 {'loss': 0.6373, 'grad_norm': 0.7758826613426208, 'learning_rate': 1.7311130952380955e-05, 'epoch': 0.8107142857142857}
|
228 |
+
Step: 1139 {'loss': 0.2984, 'grad_norm': 0.8225033283233643, 'learning_rate': 1.7299285714285718e-05, 'epoch': 0.8142857142857143}
|
229 |
+
Step: 1144 {'loss': 0.3331, 'grad_norm': 0.4973011910915375, 'learning_rate': 1.7287440476190477e-05, 'epoch': 0.8178571428571428}
|
230 |
+
Step: 1149 {'loss': 0.2866, 'grad_norm': 1.2448415756225586, 'learning_rate': 1.727559523809524e-05, 'epoch': 0.8214285714285714}
|
231 |
+
Step: 1154 {'loss': 0.3561, 'grad_norm': 0.8659022450447083, 'learning_rate': 1.7263750000000002e-05, 'epoch': 0.825}
|
232 |
+
Step: 1159 {'loss': 0.3392, 'grad_norm': 1.0248117446899414, 'learning_rate': 1.7251904761904765e-05, 'epoch': 0.8285714285714286}
|
233 |
+
Step: 1164 {'loss': 0.364, 'grad_norm': 0.7102665305137634, 'learning_rate': 1.7240059523809527e-05, 'epoch': 0.8321428571428572}
|
234 |
+
Step: 1169 {'loss': 0.3458, 'grad_norm': 0.7900285124778748, 'learning_rate': 1.7228214285714286e-05, 'epoch': 0.8357142857142857}
|
235 |
+
Step: 1174 {'loss': 0.3239, 'grad_norm': 1.0749526023864746, 'learning_rate': 1.721636904761905e-05, 'epoch': 0.8392857142857143}
|
236 |
+
Step: 1179 {'loss': 0.4444, 'grad_norm': 0.6646351218223572, 'learning_rate': 1.720452380952381e-05, 'epoch': 0.8428571428571429}
|
237 |
+
Step: 1184 {'loss': 0.3368, 'grad_norm': 0.6927091479301453, 'learning_rate': 1.7192678571428574e-05, 'epoch': 0.8464285714285714}
|
238 |
+
Step: 1189 {'loss': 0.2657, 'grad_norm': 0.7363135814666748, 'learning_rate': 1.7180833333333337e-05, 'epoch': 0.85}
|
239 |
+
Step: 1194 {'loss': 0.4087, 'grad_norm': 1.0950276851654053, 'learning_rate': 1.71689880952381e-05, 'epoch': 0.8535714285714285}
|
240 |
+
Step: 1199 {'loss': 0.3012, 'grad_norm': 0.8317290544509888, 'learning_rate': 1.715714285714286e-05, 'epoch': 0.8571428571428571}
|
241 |
+
Step: 1204 {'loss': 0.3327, 'grad_norm': 0.8805026412010193, 'learning_rate': 1.714529761904762e-05, 'epoch': 0.8607142857142858}
|
242 |
+
Step: 1209 {'loss': 0.2755, 'grad_norm': 0.5442625880241394, 'learning_rate': 1.7133452380952384e-05, 'epoch': 0.8642857142857143}
|
243 |
+
Step: 1214 {'loss': 0.3451, 'grad_norm': 0.8105046153068542, 'learning_rate': 1.7121607142857143e-05, 'epoch': 0.8678571428571429}
|
244 |
+
Step: 1219 {'loss': 0.2655, 'grad_norm': 0.8646172881126404, 'learning_rate': 1.7109761904761905e-05, 'epoch': 0.8714285714285714}
|
245 |
+
Step: 1224 {'loss': 0.3989, 'grad_norm': 0.6593888401985168, 'learning_rate': 1.7097916666666668e-05, 'epoch': 0.875}
|
246 |
+
Step: 1229 {'loss': 0.269, 'grad_norm': 0.6822863221168518, 'learning_rate': 1.708607142857143e-05, 'epoch': 0.8785714285714286}
|
247 |
+
Step: 1234 {'loss': 0.3261, 'grad_norm': 0.8558134436607361, 'learning_rate': 1.707422619047619e-05, 'epoch': 0.8821428571428571}
|
248 |
+
Step: 1239 {'loss': 0.317, 'grad_norm': 0.9368207454681396, 'learning_rate': 1.7062380952380952e-05, 'epoch': 0.8857142857142857}
|
249 |
+
Step: 1244 {'loss': 0.3394, 'grad_norm': 1.1915005445480347, 'learning_rate': 1.7050535714285715e-05, 'epoch': 0.8892857142857142}
|
250 |
+
Step: 1249 {'loss': 0.3968, 'grad_norm': 1.3165326118469238, 'learning_rate': 1.7038690476190477e-05, 'epoch': 0.8928571428571429}
|
251 |
+
Step: 1254 {'loss': 0.3263, 'grad_norm': 1.0021530389785767, 'learning_rate': 1.702684523809524e-05, 'epoch': 0.8964285714285715}
|
252 |
+
Step: 1259 {'loss': 0.3063, 'grad_norm': 0.679772138595581, 'learning_rate': 1.7015000000000002e-05, 'epoch': 0.9}
|
253 |
+
Step: 1264 {'loss': 0.3759, 'grad_norm': 1.0995697975158691, 'learning_rate': 1.700315476190476e-05, 'epoch': 0.9035714285714286}
|
254 |
+
Step: 1269 {'loss': 0.377, 'grad_norm': 0.7673999071121216, 'learning_rate': 1.6991309523809524e-05, 'epoch': 0.9071428571428571}
|
255 |
+
Step: 1274 {'loss': 0.2665, 'grad_norm': 0.9506070613861084, 'learning_rate': 1.6979464285714287e-05, 'epoch': 0.9107142857142857}
|
256 |
+
Step: 1279 {'loss': 0.337, 'grad_norm': 0.766395092010498, 'learning_rate': 1.696761904761905e-05, 'epoch': 0.9142857142857143}
|
257 |
+
Step: 1284 {'loss': 0.4346, 'grad_norm': 0.9738909602165222, 'learning_rate': 1.695577380952381e-05, 'epoch': 0.9178571428571428}
|
258 |
+
Step: 1289 {'loss': 0.3746, 'grad_norm': 0.7770394086837769, 'learning_rate': 1.694392857142857e-05, 'epoch': 0.9214285714285714}
|
259 |
+
Step: 1294 {'loss': 0.3313, 'grad_norm': 0.7297780513763428, 'learning_rate': 1.6932083333333333e-05, 'epoch': 0.925}
|
260 |
+
Step: 1299 {'loss': 0.2412, 'grad_norm': 1.143165111541748, 'learning_rate': 1.6920238095238096e-05, 'epoch': 0.9285714285714286}
|
261 |
+
Step: 1304 {'loss': 0.4314, 'grad_norm': 1.1894090175628662, 'learning_rate': 1.690839285714286e-05, 'epoch': 0.9321428571428572}
|
262 |
+
Step: 1309 {'loss': 0.2754, 'grad_norm': 0.7497856020927429, 'learning_rate': 1.689654761904762e-05, 'epoch': 0.9357142857142857}
|
263 |
+
Step: 1314 {'loss': 0.3167, 'grad_norm': 0.7976461052894592, 'learning_rate': 1.6884702380952384e-05, 'epoch': 0.9392857142857143}
|
264 |
+
Step: 1319 {'loss': 0.309, 'grad_norm': 0.8367032408714294, 'learning_rate': 1.6872857142857143e-05, 'epoch': 0.9428571428571428}
|
265 |
+
Step: 1324 {'loss': 0.2803, 'grad_norm': 0.7481916546821594, 'learning_rate': 1.6861011904761905e-05, 'epoch': 0.9464285714285714}
|
266 |
+
Step: 1329 {'loss': 0.3361, 'grad_norm': 1.4836623668670654, 'learning_rate': 1.6849166666666668e-05, 'epoch': 0.95}
|
267 |
+
Step: 1334 {'loss': 0.236, 'grad_norm': 0.7607225775718689, 'learning_rate': 1.683732142857143e-05, 'epoch': 0.9535714285714286}
|
268 |
+
Step: 1339 {'loss': 0.3701, 'grad_norm': 0.9192880988121033, 'learning_rate': 1.6825476190476193e-05, 'epoch': 0.9571428571428572}
|
269 |
+
Step: 1344 {'loss': 0.3533, 'grad_norm': 0.8493601083755493, 'learning_rate': 1.6813630952380955e-05, 'epoch': 0.9607142857142857}
|
270 |
+
Step: 1349 {'loss': 0.3261, 'grad_norm': 0.7196798324584961, 'learning_rate': 1.6801785714285715e-05, 'epoch': 0.9642857142857143}
|
271 |
+
Step: 1354 {'loss': 0.3694, 'grad_norm': 1.0612856149673462, 'learning_rate': 1.6789940476190477e-05, 'epoch': 0.9678571428571429}
|
272 |
+
Step: 1359 {'loss': 0.3639, 'grad_norm': 0.9527296423912048, 'learning_rate': 1.677809523809524e-05, 'epoch': 0.9714285714285714}
|
273 |
+
Step: 1364 {'loss': 0.3435, 'grad_norm': 0.7819812893867493, 'learning_rate': 1.6766250000000002e-05, 'epoch': 0.975}
|
274 |
+
Step: 1369 {'loss': 0.3111, 'grad_norm': 0.7119016647338867, 'learning_rate': 1.6754404761904765e-05, 'epoch': 0.9785714285714285}
|
275 |
+
Step: 1374 {'loss': 0.3079, 'grad_norm': 0.6907299757003784, 'learning_rate': 1.6742559523809524e-05, 'epoch': 0.9821428571428571}
|
276 |
+
Step: 1379 {'loss': 0.3241, 'grad_norm': 0.8788382411003113, 'learning_rate': 1.6730714285714287e-05, 'epoch': 0.9857142857142858}
|
277 |
+
Step: 1384 {'loss': 0.2445, 'grad_norm': 0.9006222486495972, 'learning_rate': 1.671886904761905e-05, 'epoch': 0.9892857142857143}
|
278 |
+
Step: 1389 {'loss': 0.2705, 'grad_norm': 0.8007070422172546, 'learning_rate': 1.670702380952381e-05, 'epoch': 0.9928571428571429}
|
279 |
+
Step: 1394 {'loss': 0.3579, 'grad_norm': 1.085525393486023, 'learning_rate': 1.6695178571428574e-05, 'epoch': 0.9964285714285714}
|
280 |
+
Step: 1399 {'loss': 0.278, 'grad_norm': 0.8911979794502258, 'learning_rate': 1.6683333333333337e-05, 'epoch': 1.0}
|
281 |
+
Interrupted by user
|
282 |
+
Step: 1401 {'train_runtime': 18039.657, 'train_samples_per_second': 1.863, 'train_steps_per_second': 0.466, 'train_loss': 0.4571322862499961, 'epoch': 1.0014285714285713}
|
283 |
+
17:57:53-413132 INFO LoRA training run is completed and saved.
|
284 |
+
17:57:53-510668 INFO Training complete, saving
|
285 |
+
17:57:53-597183 INFO Training interrupted.
|
286 |
+
|
287 |
+
|
288 |
+
|
config.json
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
{
|
2 |
-
"_name_or_path": ".\\
|
3 |
"architectures": [
|
4 |
"MixtralForCausalLM"
|
5 |
],
|
@@ -29,7 +29,7 @@
|
|
29 |
"sliding_window": null,
|
30 |
"tie_word_embeddings": false,
|
31 |
"torch_dtype": "bfloat16",
|
32 |
-
"transformers_version": "4.44.
|
33 |
"use_cache": false,
|
34 |
"vocab_size": 32064
|
35 |
}
|
|
|
1 |
{
|
2 |
+
"_name_or_path": ".\\BlackSheep",
|
3 |
"architectures": [
|
4 |
"MixtralForCausalLM"
|
5 |
],
|
|
|
29 |
"sliding_window": null,
|
30 |
"tie_word_embeddings": false,
|
31 |
"torch_dtype": "bfloat16",
|
32 |
+
"transformers_version": "4.44.2",
|
33 |
"use_cache": false,
|
34 |
"vocab_size": 32064
|
35 |
}
|
generation_config.json
CHANGED
@@ -2,5 +2,5 @@
|
|
2 |
"_from_model_config": true,
|
3 |
"bos_token_id": 1,
|
4 |
"eos_token_id": 32000,
|
5 |
-
"transformers_version": "4.44.
|
6 |
}
|
|
|
2 |
"_from_model_config": true,
|
3 |
"bos_token_id": 1,
|
4 |
"eos_token_id": 32000,
|
5 |
+
"transformers_version": "4.44.2"
|
6 |
}
|
model-00001-of-00004.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3c5801f3d971ed8ffe5c8c787bcbd09e0e954a46741a92e3a666da2c1011fdce
|
3 |
+
size 4991385392
|
model-00002-of-00004.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9cddb7085ddda505a7a73388c789fbe5f37ab266fa9340abe6f25a788b0e8266
|
3 |
+
size 4995729856
|
model-00003-of-00004.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1f03f0654b8149e0d9dc941791d9c08c734d1e43e079e5b1680232fbeb830d4a
|
3 |
+
size 4957962344
|
model-00004-of-00004.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:463bf1b25f6a83f7e414b9c001e59a5952c4b291fe2b99cf7c9e07c5841a8b75
|
3 |
+
size 2361411944
|
model.safetensors.index.json
ADDED
@@ -0,0 +1,522 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"metadata": {
|
3 |
+
"total_size": 17306425344
|
4 |
+
},
|
5 |
+
"weight_map": {
|
6 |
+
"lm_head.weight": "model-00004-of-00004.safetensors",
|
7 |
+
"model.embed_tokens.weight": "model-00001-of-00004.safetensors",
|
8 |
+
"model.layers.0.block_sparse_moe.experts.0.w1.weight": "model-00001-of-00004.safetensors",
|
9 |
+
"model.layers.0.block_sparse_moe.experts.0.w2.weight": "model-00001-of-00004.safetensors",
|
10 |
+
"model.layers.0.block_sparse_moe.experts.0.w3.weight": "model-00001-of-00004.safetensors",
|
11 |
+
"model.layers.0.block_sparse_moe.experts.1.w1.weight": "model-00001-of-00004.safetensors",
|
12 |
+
"model.layers.0.block_sparse_moe.experts.1.w2.weight": "model-00001-of-00004.safetensors",
|
13 |
+
"model.layers.0.block_sparse_moe.experts.1.w3.weight": "model-00001-of-00004.safetensors",
|
14 |
+
"model.layers.0.block_sparse_moe.experts.2.w1.weight": "model-00001-of-00004.safetensors",
|
15 |
+
"model.layers.0.block_sparse_moe.experts.2.w2.weight": "model-00001-of-00004.safetensors",
|
16 |
+
"model.layers.0.block_sparse_moe.experts.2.w3.weight": "model-00001-of-00004.safetensors",
|
17 |
+
"model.layers.0.block_sparse_moe.gate.weight": "model-00001-of-00004.safetensors",
|
18 |
+
"model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
19 |
+
"model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
20 |
+
"model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
21 |
+
"model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
22 |
+
"model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
23 |
+
"model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
24 |
+
"model.layers.1.block_sparse_moe.experts.0.w1.weight": "model-00001-of-00004.safetensors",
|
25 |
+
"model.layers.1.block_sparse_moe.experts.0.w2.weight": "model-00001-of-00004.safetensors",
|
26 |
+
"model.layers.1.block_sparse_moe.experts.0.w3.weight": "model-00001-of-00004.safetensors",
|
27 |
+
"model.layers.1.block_sparse_moe.experts.1.w1.weight": "model-00001-of-00004.safetensors",
|
28 |
+
"model.layers.1.block_sparse_moe.experts.1.w2.weight": "model-00001-of-00004.safetensors",
|
29 |
+
"model.layers.1.block_sparse_moe.experts.1.w3.weight": "model-00001-of-00004.safetensors",
|
30 |
+
"model.layers.1.block_sparse_moe.experts.2.w1.weight": "model-00001-of-00004.safetensors",
|
31 |
+
"model.layers.1.block_sparse_moe.experts.2.w2.weight": "model-00001-of-00004.safetensors",
|
32 |
+
"model.layers.1.block_sparse_moe.experts.2.w3.weight": "model-00001-of-00004.safetensors",
|
33 |
+
"model.layers.1.block_sparse_moe.gate.weight": "model-00001-of-00004.safetensors",
|
34 |
+
"model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
35 |
+
"model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
36 |
+
"model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
37 |
+
"model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
38 |
+
"model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
39 |
+
"model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
40 |
+
"model.layers.10.block_sparse_moe.experts.0.w1.weight": "model-00002-of-00004.safetensors",
|
41 |
+
"model.layers.10.block_sparse_moe.experts.0.w2.weight": "model-00002-of-00004.safetensors",
|
42 |
+
"model.layers.10.block_sparse_moe.experts.0.w3.weight": "model-00002-of-00004.safetensors",
|
43 |
+
"model.layers.10.block_sparse_moe.experts.1.w1.weight": "model-00002-of-00004.safetensors",
|
44 |
+
"model.layers.10.block_sparse_moe.experts.1.w2.weight": "model-00002-of-00004.safetensors",
|
45 |
+
"model.layers.10.block_sparse_moe.experts.1.w3.weight": "model-00002-of-00004.safetensors",
|
46 |
+
"model.layers.10.block_sparse_moe.experts.2.w1.weight": "model-00002-of-00004.safetensors",
|
47 |
+
"model.layers.10.block_sparse_moe.experts.2.w2.weight": "model-00002-of-00004.safetensors",
|
48 |
+
"model.layers.10.block_sparse_moe.experts.2.w3.weight": "model-00002-of-00004.safetensors",
|
49 |
+
"model.layers.10.block_sparse_moe.gate.weight": "model-00002-of-00004.safetensors",
|
50 |
+
"model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
51 |
+
"model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
52 |
+
"model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
53 |
+
"model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
54 |
+
"model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
55 |
+
"model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
56 |
+
"model.layers.11.block_sparse_moe.experts.0.w1.weight": "model-00002-of-00004.safetensors",
|
57 |
+
"model.layers.11.block_sparse_moe.experts.0.w2.weight": "model-00002-of-00004.safetensors",
|
58 |
+
"model.layers.11.block_sparse_moe.experts.0.w3.weight": "model-00002-of-00004.safetensors",
|
59 |
+
"model.layers.11.block_sparse_moe.experts.1.w1.weight": "model-00002-of-00004.safetensors",
|
60 |
+
"model.layers.11.block_sparse_moe.experts.1.w2.weight": "model-00002-of-00004.safetensors",
|
61 |
+
"model.layers.11.block_sparse_moe.experts.1.w3.weight": "model-00002-of-00004.safetensors",
|
62 |
+
"model.layers.11.block_sparse_moe.experts.2.w1.weight": "model-00002-of-00004.safetensors",
|
63 |
+
"model.layers.11.block_sparse_moe.experts.2.w2.weight": "model-00002-of-00004.safetensors",
|
64 |
+
"model.layers.11.block_sparse_moe.experts.2.w3.weight": "model-00002-of-00004.safetensors",
|
65 |
+
"model.layers.11.block_sparse_moe.gate.weight": "model-00002-of-00004.safetensors",
|
66 |
+
"model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
67 |
+
"model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
68 |
+
"model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
69 |
+
"model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
70 |
+
"model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
71 |
+
"model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
72 |
+
"model.layers.12.block_sparse_moe.experts.0.w1.weight": "model-00002-of-00004.safetensors",
|
73 |
+
"model.layers.12.block_sparse_moe.experts.0.w2.weight": "model-00002-of-00004.safetensors",
|
74 |
+
"model.layers.12.block_sparse_moe.experts.0.w3.weight": "model-00002-of-00004.safetensors",
|
75 |
+
"model.layers.12.block_sparse_moe.experts.1.w1.weight": "model-00002-of-00004.safetensors",
|
76 |
+
"model.layers.12.block_sparse_moe.experts.1.w2.weight": "model-00002-of-00004.safetensors",
|
77 |
+
"model.layers.12.block_sparse_moe.experts.1.w3.weight": "model-00002-of-00004.safetensors",
|
78 |
+
"model.layers.12.block_sparse_moe.experts.2.w1.weight": "model-00002-of-00004.safetensors",
|
79 |
+
"model.layers.12.block_sparse_moe.experts.2.w2.weight": "model-00002-of-00004.safetensors",
|
80 |
+
"model.layers.12.block_sparse_moe.experts.2.w3.weight": "model-00002-of-00004.safetensors",
|
81 |
+
"model.layers.12.block_sparse_moe.gate.weight": "model-00002-of-00004.safetensors",
|
82 |
+
"model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
83 |
+
"model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
84 |
+
"model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
85 |
+
"model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
86 |
+
"model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
87 |
+
"model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
88 |
+
"model.layers.13.block_sparse_moe.experts.0.w1.weight": "model-00002-of-00004.safetensors",
|
89 |
+
"model.layers.13.block_sparse_moe.experts.0.w2.weight": "model-00002-of-00004.safetensors",
|
90 |
+
"model.layers.13.block_sparse_moe.experts.0.w3.weight": "model-00002-of-00004.safetensors",
|
91 |
+
"model.layers.13.block_sparse_moe.experts.1.w1.weight": "model-00002-of-00004.safetensors",
|
92 |
+
"model.layers.13.block_sparse_moe.experts.1.w2.weight": "model-00002-of-00004.safetensors",
|
93 |
+
"model.layers.13.block_sparse_moe.experts.1.w3.weight": "model-00002-of-00004.safetensors",
|
94 |
+
"model.layers.13.block_sparse_moe.experts.2.w1.weight": "model-00002-of-00004.safetensors",
|
95 |
+
"model.layers.13.block_sparse_moe.experts.2.w2.weight": "model-00002-of-00004.safetensors",
|
96 |
+
"model.layers.13.block_sparse_moe.experts.2.w3.weight": "model-00002-of-00004.safetensors",
|
97 |
+
"model.layers.13.block_sparse_moe.gate.weight": "model-00002-of-00004.safetensors",
|
98 |
+
"model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
99 |
+
"model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
100 |
+
"model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
101 |
+
"model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
102 |
+
"model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
103 |
+
"model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
104 |
+
"model.layers.14.block_sparse_moe.experts.0.w1.weight": "model-00002-of-00004.safetensors",
|
105 |
+
"model.layers.14.block_sparse_moe.experts.0.w2.weight": "model-00002-of-00004.safetensors",
|
106 |
+
"model.layers.14.block_sparse_moe.experts.0.w3.weight": "model-00002-of-00004.safetensors",
|
107 |
+
"model.layers.14.block_sparse_moe.experts.1.w1.weight": "model-00002-of-00004.safetensors",
|
108 |
+
"model.layers.14.block_sparse_moe.experts.1.w2.weight": "model-00002-of-00004.safetensors",
|
109 |
+
"model.layers.14.block_sparse_moe.experts.1.w3.weight": "model-00002-of-00004.safetensors",
|
110 |
+
"model.layers.14.block_sparse_moe.experts.2.w1.weight": "model-00002-of-00004.safetensors",
|
111 |
+
"model.layers.14.block_sparse_moe.experts.2.w2.weight": "model-00002-of-00004.safetensors",
|
112 |
+
"model.layers.14.block_sparse_moe.experts.2.w3.weight": "model-00002-of-00004.safetensors",
|
113 |
+
"model.layers.14.block_sparse_moe.gate.weight": "model-00002-of-00004.safetensors",
|
114 |
+
"model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
115 |
+
"model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
116 |
+
"model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
117 |
+
"model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
118 |
+
"model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
119 |
+
"model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
120 |
+
"model.layers.15.block_sparse_moe.experts.0.w1.weight": "model-00002-of-00004.safetensors",
|
121 |
+
"model.layers.15.block_sparse_moe.experts.0.w2.weight": "model-00002-of-00004.safetensors",
|
122 |
+
"model.layers.15.block_sparse_moe.experts.0.w3.weight": "model-00002-of-00004.safetensors",
|
123 |
+
"model.layers.15.block_sparse_moe.experts.1.w1.weight": "model-00002-of-00004.safetensors",
|
124 |
+
"model.layers.15.block_sparse_moe.experts.1.w2.weight": "model-00002-of-00004.safetensors",
|
125 |
+
"model.layers.15.block_sparse_moe.experts.1.w3.weight": "model-00002-of-00004.safetensors",
|
126 |
+
"model.layers.15.block_sparse_moe.experts.2.w1.weight": "model-00002-of-00004.safetensors",
|
127 |
+
"model.layers.15.block_sparse_moe.experts.2.w2.weight": "model-00002-of-00004.safetensors",
|
128 |
+
"model.layers.15.block_sparse_moe.experts.2.w3.weight": "model-00002-of-00004.safetensors",
|
129 |
+
"model.layers.15.block_sparse_moe.gate.weight": "model-00002-of-00004.safetensors",
|
130 |
+
"model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
131 |
+
"model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
132 |
+
"model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
133 |
+
"model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
134 |
+
"model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
135 |
+
"model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
136 |
+
"model.layers.16.block_sparse_moe.experts.0.w1.weight": "model-00002-of-00004.safetensors",
|
137 |
+
"model.layers.16.block_sparse_moe.experts.0.w2.weight": "model-00002-of-00004.safetensors",
|
138 |
+
"model.layers.16.block_sparse_moe.experts.0.w3.weight": "model-00002-of-00004.safetensors",
|
139 |
+
"model.layers.16.block_sparse_moe.experts.1.w1.weight": "model-00002-of-00004.safetensors",
|
140 |
+
"model.layers.16.block_sparse_moe.experts.1.w2.weight": "model-00002-of-00004.safetensors",
|
141 |
+
"model.layers.16.block_sparse_moe.experts.1.w3.weight": "model-00002-of-00004.safetensors",
|
142 |
+
"model.layers.16.block_sparse_moe.experts.2.w1.weight": "model-00002-of-00004.safetensors",
|
143 |
+
"model.layers.16.block_sparse_moe.experts.2.w2.weight": "model-00002-of-00004.safetensors",
|
144 |
+
"model.layers.16.block_sparse_moe.experts.2.w3.weight": "model-00002-of-00004.safetensors",
|
145 |
+
"model.layers.16.block_sparse_moe.gate.weight": "model-00002-of-00004.safetensors",
|
146 |
+
"model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
147 |
+
"model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
148 |
+
"model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
149 |
+
"model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
150 |
+
"model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
151 |
+
"model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
152 |
+
"model.layers.17.block_sparse_moe.experts.0.w1.weight": "model-00002-of-00004.safetensors",
|
153 |
+
"model.layers.17.block_sparse_moe.experts.0.w2.weight": "model-00002-of-00004.safetensors",
|
154 |
+
"model.layers.17.block_sparse_moe.experts.0.w3.weight": "model-00002-of-00004.safetensors",
|
155 |
+
"model.layers.17.block_sparse_moe.experts.1.w1.weight": "model-00002-of-00004.safetensors",
|
156 |
+
"model.layers.17.block_sparse_moe.experts.1.w2.weight": "model-00002-of-00004.safetensors",
|
157 |
+
"model.layers.17.block_sparse_moe.experts.1.w3.weight": "model-00002-of-00004.safetensors",
|
158 |
+
"model.layers.17.block_sparse_moe.experts.2.w1.weight": "model-00002-of-00004.safetensors",
|
159 |
+
"model.layers.17.block_sparse_moe.experts.2.w2.weight": "model-00002-of-00004.safetensors",
|
160 |
+
"model.layers.17.block_sparse_moe.experts.2.w3.weight": "model-00002-of-00004.safetensors",
|
161 |
+
"model.layers.17.block_sparse_moe.gate.weight": "model-00002-of-00004.safetensors",
|
162 |
+
"model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
163 |
+
"model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
164 |
+
"model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
165 |
+
"model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
166 |
+
"model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
167 |
+
"model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
168 |
+
"model.layers.18.block_sparse_moe.experts.0.w1.weight": "model-00002-of-00004.safetensors",
|
169 |
+
"model.layers.18.block_sparse_moe.experts.0.w2.weight": "model-00002-of-00004.safetensors",
|
170 |
+
"model.layers.18.block_sparse_moe.experts.0.w3.weight": "model-00002-of-00004.safetensors",
|
171 |
+
"model.layers.18.block_sparse_moe.experts.1.w1.weight": "model-00002-of-00004.safetensors",
|
172 |
+
"model.layers.18.block_sparse_moe.experts.1.w2.weight": "model-00003-of-00004.safetensors",
|
173 |
+
"model.layers.18.block_sparse_moe.experts.1.w3.weight": "model-00003-of-00004.safetensors",
|
174 |
+
"model.layers.18.block_sparse_moe.experts.2.w1.weight": "model-00003-of-00004.safetensors",
|
175 |
+
"model.layers.18.block_sparse_moe.experts.2.w2.weight": "model-00003-of-00004.safetensors",
|
176 |
+
"model.layers.18.block_sparse_moe.experts.2.w3.weight": "model-00003-of-00004.safetensors",
|
177 |
+
"model.layers.18.block_sparse_moe.gate.weight": "model-00002-of-00004.safetensors",
|
178 |
+
"model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
179 |
+
"model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
180 |
+
"model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
181 |
+
"model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
182 |
+
"model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
183 |
+
"model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
184 |
+
"model.layers.19.block_sparse_moe.experts.0.w1.weight": "model-00003-of-00004.safetensors",
|
185 |
+
"model.layers.19.block_sparse_moe.experts.0.w2.weight": "model-00003-of-00004.safetensors",
|
186 |
+
"model.layers.19.block_sparse_moe.experts.0.w3.weight": "model-00003-of-00004.safetensors",
|
187 |
+
"model.layers.19.block_sparse_moe.experts.1.w1.weight": "model-00003-of-00004.safetensors",
|
188 |
+
"model.layers.19.block_sparse_moe.experts.1.w2.weight": "model-00003-of-00004.safetensors",
|
189 |
+
"model.layers.19.block_sparse_moe.experts.1.w3.weight": "model-00003-of-00004.safetensors",
|
190 |
+
"model.layers.19.block_sparse_moe.experts.2.w1.weight": "model-00003-of-00004.safetensors",
|
191 |
+
"model.layers.19.block_sparse_moe.experts.2.w2.weight": "model-00003-of-00004.safetensors",
|
192 |
+
"model.layers.19.block_sparse_moe.experts.2.w3.weight": "model-00003-of-00004.safetensors",
|
193 |
+
"model.layers.19.block_sparse_moe.gate.weight": "model-00003-of-00004.safetensors",
|
194 |
+
"model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
195 |
+
"model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
196 |
+
"model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
197 |
+
"model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
198 |
+
"model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
199 |
+
"model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
200 |
+
"model.layers.2.block_sparse_moe.experts.0.w1.weight": "model-00001-of-00004.safetensors",
|
201 |
+
"model.layers.2.block_sparse_moe.experts.0.w2.weight": "model-00001-of-00004.safetensors",
|
202 |
+
"model.layers.2.block_sparse_moe.experts.0.w3.weight": "model-00001-of-00004.safetensors",
|
203 |
+
"model.layers.2.block_sparse_moe.experts.1.w1.weight": "model-00001-of-00004.safetensors",
|
204 |
+
"model.layers.2.block_sparse_moe.experts.1.w2.weight": "model-00001-of-00004.safetensors",
|
205 |
+
"model.layers.2.block_sparse_moe.experts.1.w3.weight": "model-00001-of-00004.safetensors",
|
206 |
+
"model.layers.2.block_sparse_moe.experts.2.w1.weight": "model-00001-of-00004.safetensors",
|
207 |
+
"model.layers.2.block_sparse_moe.experts.2.w2.weight": "model-00001-of-00004.safetensors",
|
208 |
+
"model.layers.2.block_sparse_moe.experts.2.w3.weight": "model-00001-of-00004.safetensors",
|
209 |
+
"model.layers.2.block_sparse_moe.gate.weight": "model-00001-of-00004.safetensors",
|
210 |
+
"model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
211 |
+
"model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
212 |
+
"model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
213 |
+
"model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
214 |
+
"model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
215 |
+
"model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
216 |
+
"model.layers.20.block_sparse_moe.experts.0.w1.weight": "model-00003-of-00004.safetensors",
|
217 |
+
"model.layers.20.block_sparse_moe.experts.0.w2.weight": "model-00003-of-00004.safetensors",
|
218 |
+
"model.layers.20.block_sparse_moe.experts.0.w3.weight": "model-00003-of-00004.safetensors",
|
219 |
+
"model.layers.20.block_sparse_moe.experts.1.w1.weight": "model-00003-of-00004.safetensors",
|
220 |
+
"model.layers.20.block_sparse_moe.experts.1.w2.weight": "model-00003-of-00004.safetensors",
|
221 |
+
"model.layers.20.block_sparse_moe.experts.1.w3.weight": "model-00003-of-00004.safetensors",
|
222 |
+
"model.layers.20.block_sparse_moe.experts.2.w1.weight": "model-00003-of-00004.safetensors",
|
223 |
+
"model.layers.20.block_sparse_moe.experts.2.w2.weight": "model-00003-of-00004.safetensors",
|
224 |
+
"model.layers.20.block_sparse_moe.experts.2.w3.weight": "model-00003-of-00004.safetensors",
|
225 |
+
"model.layers.20.block_sparse_moe.gate.weight": "model-00003-of-00004.safetensors",
|
226 |
+
"model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
227 |
+
"model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
228 |
+
"model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
229 |
+
"model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
230 |
+
"model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
231 |
+
"model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
232 |
+
"model.layers.21.block_sparse_moe.experts.0.w1.weight": "model-00003-of-00004.safetensors",
|
233 |
+
"model.layers.21.block_sparse_moe.experts.0.w2.weight": "model-00003-of-00004.safetensors",
|
234 |
+
"model.layers.21.block_sparse_moe.experts.0.w3.weight": "model-00003-of-00004.safetensors",
|
235 |
+
"model.layers.21.block_sparse_moe.experts.1.w1.weight": "model-00003-of-00004.safetensors",
|
236 |
+
"model.layers.21.block_sparse_moe.experts.1.w2.weight": "model-00003-of-00004.safetensors",
|
237 |
+
"model.layers.21.block_sparse_moe.experts.1.w3.weight": "model-00003-of-00004.safetensors",
|
238 |
+
"model.layers.21.block_sparse_moe.experts.2.w1.weight": "model-00003-of-00004.safetensors",
|
239 |
+
"model.layers.21.block_sparse_moe.experts.2.w2.weight": "model-00003-of-00004.safetensors",
|
240 |
+
"model.layers.21.block_sparse_moe.experts.2.w3.weight": "model-00003-of-00004.safetensors",
|
241 |
+
"model.layers.21.block_sparse_moe.gate.weight": "model-00003-of-00004.safetensors",
|
242 |
+
"model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
243 |
+
"model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
244 |
+
"model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
245 |
+
"model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
246 |
+
"model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
247 |
+
"model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
248 |
+
"model.layers.22.block_sparse_moe.experts.0.w1.weight": "model-00003-of-00004.safetensors",
|
249 |
+
"model.layers.22.block_sparse_moe.experts.0.w2.weight": "model-00003-of-00004.safetensors",
|
250 |
+
"model.layers.22.block_sparse_moe.experts.0.w3.weight": "model-00003-of-00004.safetensors",
|
251 |
+
"model.layers.22.block_sparse_moe.experts.1.w1.weight": "model-00003-of-00004.safetensors",
|
252 |
+
"model.layers.22.block_sparse_moe.experts.1.w2.weight": "model-00003-of-00004.safetensors",
|
253 |
+
"model.layers.22.block_sparse_moe.experts.1.w3.weight": "model-00003-of-00004.safetensors",
|
254 |
+
"model.layers.22.block_sparse_moe.experts.2.w1.weight": "model-00003-of-00004.safetensors",
|
255 |
+
"model.layers.22.block_sparse_moe.experts.2.w2.weight": "model-00003-of-00004.safetensors",
|
256 |
+
"model.layers.22.block_sparse_moe.experts.2.w3.weight": "model-00003-of-00004.safetensors",
|
257 |
+
"model.layers.22.block_sparse_moe.gate.weight": "model-00003-of-00004.safetensors",
|
258 |
+
"model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
259 |
+
"model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
260 |
+
"model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
261 |
+
"model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
262 |
+
"model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
263 |
+
"model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
264 |
+
"model.layers.23.block_sparse_moe.experts.0.w1.weight": "model-00003-of-00004.safetensors",
|
265 |
+
"model.layers.23.block_sparse_moe.experts.0.w2.weight": "model-00003-of-00004.safetensors",
|
266 |
+
"model.layers.23.block_sparse_moe.experts.0.w3.weight": "model-00003-of-00004.safetensors",
|
267 |
+
"model.layers.23.block_sparse_moe.experts.1.w1.weight": "model-00003-of-00004.safetensors",
|
268 |
+
"model.layers.23.block_sparse_moe.experts.1.w2.weight": "model-00003-of-00004.safetensors",
|
269 |
+
"model.layers.23.block_sparse_moe.experts.1.w3.weight": "model-00003-of-00004.safetensors",
|
270 |
+
"model.layers.23.block_sparse_moe.experts.2.w1.weight": "model-00003-of-00004.safetensors",
|
271 |
+
"model.layers.23.block_sparse_moe.experts.2.w2.weight": "model-00003-of-00004.safetensors",
|
272 |
+
"model.layers.23.block_sparse_moe.experts.2.w3.weight": "model-00003-of-00004.safetensors",
|
273 |
+
"model.layers.23.block_sparse_moe.gate.weight": "model-00003-of-00004.safetensors",
|
274 |
+
"model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
275 |
+
"model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
276 |
+
"model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
277 |
+
"model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
278 |
+
"model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
279 |
+
"model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
280 |
+
"model.layers.24.block_sparse_moe.experts.0.w1.weight": "model-00003-of-00004.safetensors",
|
281 |
+
"model.layers.24.block_sparse_moe.experts.0.w2.weight": "model-00003-of-00004.safetensors",
|
282 |
+
"model.layers.24.block_sparse_moe.experts.0.w3.weight": "model-00003-of-00004.safetensors",
|
283 |
+
"model.layers.24.block_sparse_moe.experts.1.w1.weight": "model-00003-of-00004.safetensors",
|
284 |
+
"model.layers.24.block_sparse_moe.experts.1.w2.weight": "model-00003-of-00004.safetensors",
|
285 |
+
"model.layers.24.block_sparse_moe.experts.1.w3.weight": "model-00003-of-00004.safetensors",
|
286 |
+
"model.layers.24.block_sparse_moe.experts.2.w1.weight": "model-00003-of-00004.safetensors",
|
287 |
+
"model.layers.24.block_sparse_moe.experts.2.w2.weight": "model-00003-of-00004.safetensors",
|
288 |
+
"model.layers.24.block_sparse_moe.experts.2.w3.weight": "model-00003-of-00004.safetensors",
|
289 |
+
"model.layers.24.block_sparse_moe.gate.weight": "model-00003-of-00004.safetensors",
|
290 |
+
"model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
291 |
+
"model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
292 |
+
"model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
293 |
+
"model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
294 |
+
"model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
295 |
+
"model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
296 |
+
"model.layers.25.block_sparse_moe.experts.0.w1.weight": "model-00003-of-00004.safetensors",
|
297 |
+
"model.layers.25.block_sparse_moe.experts.0.w2.weight": "model-00003-of-00004.safetensors",
|
298 |
+
"model.layers.25.block_sparse_moe.experts.0.w3.weight": "model-00003-of-00004.safetensors",
|
299 |
+
"model.layers.25.block_sparse_moe.experts.1.w1.weight": "model-00003-of-00004.safetensors",
|
300 |
+
"model.layers.25.block_sparse_moe.experts.1.w2.weight": "model-00003-of-00004.safetensors",
|
301 |
+
"model.layers.25.block_sparse_moe.experts.1.w3.weight": "model-00003-of-00004.safetensors",
|
302 |
+
"model.layers.25.block_sparse_moe.experts.2.w1.weight": "model-00003-of-00004.safetensors",
|
303 |
+
"model.layers.25.block_sparse_moe.experts.2.w2.weight": "model-00003-of-00004.safetensors",
|
304 |
+
"model.layers.25.block_sparse_moe.experts.2.w3.weight": "model-00003-of-00004.safetensors",
|
305 |
+
"model.layers.25.block_sparse_moe.gate.weight": "model-00003-of-00004.safetensors",
|
306 |
+
"model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
307 |
+
"model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
308 |
+
"model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
309 |
+
"model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
310 |
+
"model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
311 |
+
"model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
312 |
+
"model.layers.26.block_sparse_moe.experts.0.w1.weight": "model-00003-of-00004.safetensors",
|
313 |
+
"model.layers.26.block_sparse_moe.experts.0.w2.weight": "model-00003-of-00004.safetensors",
|
314 |
+
"model.layers.26.block_sparse_moe.experts.0.w3.weight": "model-00003-of-00004.safetensors",
|
315 |
+
"model.layers.26.block_sparse_moe.experts.1.w1.weight": "model-00003-of-00004.safetensors",
|
316 |
+
"model.layers.26.block_sparse_moe.experts.1.w2.weight": "model-00003-of-00004.safetensors",
|
317 |
+
"model.layers.26.block_sparse_moe.experts.1.w3.weight": "model-00003-of-00004.safetensors",
|
318 |
+
"model.layers.26.block_sparse_moe.experts.2.w1.weight": "model-00003-of-00004.safetensors",
|
319 |
+
"model.layers.26.block_sparse_moe.experts.2.w2.weight": "model-00003-of-00004.safetensors",
|
320 |
+
"model.layers.26.block_sparse_moe.experts.2.w3.weight": "model-00003-of-00004.safetensors",
|
321 |
+
"model.layers.26.block_sparse_moe.gate.weight": "model-00003-of-00004.safetensors",
|
322 |
+
"model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
323 |
+
"model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
324 |
+
"model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
325 |
+
"model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
326 |
+
"model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
327 |
+
"model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
328 |
+
"model.layers.27.block_sparse_moe.experts.0.w1.weight": "model-00003-of-00004.safetensors",
|
329 |
+
"model.layers.27.block_sparse_moe.experts.0.w2.weight": "model-00003-of-00004.safetensors",
|
330 |
+
"model.layers.27.block_sparse_moe.experts.0.w3.weight": "model-00003-of-00004.safetensors",
|
331 |
+
"model.layers.27.block_sparse_moe.experts.1.w1.weight": "model-00003-of-00004.safetensors",
|
332 |
+
"model.layers.27.block_sparse_moe.experts.1.w2.weight": "model-00003-of-00004.safetensors",
|
333 |
+
"model.layers.27.block_sparse_moe.experts.1.w3.weight": "model-00003-of-00004.safetensors",
|
334 |
+
"model.layers.27.block_sparse_moe.experts.2.w1.weight": "model-00003-of-00004.safetensors",
|
335 |
+
"model.layers.27.block_sparse_moe.experts.2.w2.weight": "model-00003-of-00004.safetensors",
|
336 |
+
"model.layers.27.block_sparse_moe.experts.2.w3.weight": "model-00004-of-00004.safetensors",
|
337 |
+
"model.layers.27.block_sparse_moe.gate.weight": "model-00003-of-00004.safetensors",
|
338 |
+
"model.layers.27.input_layernorm.weight": "model-00004-of-00004.safetensors",
|
339 |
+
"model.layers.27.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
|
340 |
+
"model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
341 |
+
"model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
342 |
+
"model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
343 |
+
"model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
344 |
+
"model.layers.28.block_sparse_moe.experts.0.w1.weight": "model-00004-of-00004.safetensors",
|
345 |
+
"model.layers.28.block_sparse_moe.experts.0.w2.weight": "model-00004-of-00004.safetensors",
|
346 |
+
"model.layers.28.block_sparse_moe.experts.0.w3.weight": "model-00004-of-00004.safetensors",
|
347 |
+
"model.layers.28.block_sparse_moe.experts.1.w1.weight": "model-00004-of-00004.safetensors",
|
348 |
+
"model.layers.28.block_sparse_moe.experts.1.w2.weight": "model-00004-of-00004.safetensors",
|
349 |
+
"model.layers.28.block_sparse_moe.experts.1.w3.weight": "model-00004-of-00004.safetensors",
|
350 |
+
"model.layers.28.block_sparse_moe.experts.2.w1.weight": "model-00004-of-00004.safetensors",
|
351 |
+
"model.layers.28.block_sparse_moe.experts.2.w2.weight": "model-00004-of-00004.safetensors",
|
352 |
+
"model.layers.28.block_sparse_moe.experts.2.w3.weight": "model-00004-of-00004.safetensors",
|
353 |
+
"model.layers.28.block_sparse_moe.gate.weight": "model-00004-of-00004.safetensors",
|
354 |
+
"model.layers.28.input_layernorm.weight": "model-00004-of-00004.safetensors",
|
355 |
+
"model.layers.28.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
|
356 |
+
"model.layers.28.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
|
357 |
+
"model.layers.28.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
|
358 |
+
"model.layers.28.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
|
359 |
+
"model.layers.28.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
|
360 |
+
"model.layers.29.block_sparse_moe.experts.0.w1.weight": "model-00004-of-00004.safetensors",
|
361 |
+
"model.layers.29.block_sparse_moe.experts.0.w2.weight": "model-00004-of-00004.safetensors",
|
362 |
+
"model.layers.29.block_sparse_moe.experts.0.w3.weight": "model-00004-of-00004.safetensors",
|
363 |
+
"model.layers.29.block_sparse_moe.experts.1.w1.weight": "model-00004-of-00004.safetensors",
|
364 |
+
"model.layers.29.block_sparse_moe.experts.1.w2.weight": "model-00004-of-00004.safetensors",
|
365 |
+
"model.layers.29.block_sparse_moe.experts.1.w3.weight": "model-00004-of-00004.safetensors",
|
366 |
+
"model.layers.29.block_sparse_moe.experts.2.w1.weight": "model-00004-of-00004.safetensors",
|
367 |
+
"model.layers.29.block_sparse_moe.experts.2.w2.weight": "model-00004-of-00004.safetensors",
|
368 |
+
"model.layers.29.block_sparse_moe.experts.2.w3.weight": "model-00004-of-00004.safetensors",
|
369 |
+
"model.layers.29.block_sparse_moe.gate.weight": "model-00004-of-00004.safetensors",
|
370 |
+
"model.layers.29.input_layernorm.weight": "model-00004-of-00004.safetensors",
|
371 |
+
"model.layers.29.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
|
372 |
+
"model.layers.29.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
|
373 |
+
"model.layers.29.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
|
374 |
+
"model.layers.29.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
|
375 |
+
"model.layers.29.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
|
376 |
+
"model.layers.3.block_sparse_moe.experts.0.w1.weight": "model-00001-of-00004.safetensors",
|
377 |
+
"model.layers.3.block_sparse_moe.experts.0.w2.weight": "model-00001-of-00004.safetensors",
|
378 |
+
"model.layers.3.block_sparse_moe.experts.0.w3.weight": "model-00001-of-00004.safetensors",
|
379 |
+
"model.layers.3.block_sparse_moe.experts.1.w1.weight": "model-00001-of-00004.safetensors",
|
380 |
+
"model.layers.3.block_sparse_moe.experts.1.w2.weight": "model-00001-of-00004.safetensors",
|
381 |
+
"model.layers.3.block_sparse_moe.experts.1.w3.weight": "model-00001-of-00004.safetensors",
|
382 |
+
"model.layers.3.block_sparse_moe.experts.2.w1.weight": "model-00001-of-00004.safetensors",
|
383 |
+
"model.layers.3.block_sparse_moe.experts.2.w2.weight": "model-00001-of-00004.safetensors",
|
384 |
+
"model.layers.3.block_sparse_moe.experts.2.w3.weight": "model-00001-of-00004.safetensors",
|
385 |
+
"model.layers.3.block_sparse_moe.gate.weight": "model-00001-of-00004.safetensors",
|
386 |
+
"model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
387 |
+
"model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
388 |
+
"model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
389 |
+
"model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
390 |
+
"model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
391 |
+
"model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
392 |
+
"model.layers.30.block_sparse_moe.experts.0.w1.weight": "model-00004-of-00004.safetensors",
|
393 |
+
"model.layers.30.block_sparse_moe.experts.0.w2.weight": "model-00004-of-00004.safetensors",
|
394 |
+
"model.layers.30.block_sparse_moe.experts.0.w3.weight": "model-00004-of-00004.safetensors",
|
395 |
+
"model.layers.30.block_sparse_moe.experts.1.w1.weight": "model-00004-of-00004.safetensors",
|
396 |
+
"model.layers.30.block_sparse_moe.experts.1.w2.weight": "model-00004-of-00004.safetensors",
|
397 |
+
"model.layers.30.block_sparse_moe.experts.1.w3.weight": "model-00004-of-00004.safetensors",
|
398 |
+
"model.layers.30.block_sparse_moe.experts.2.w1.weight": "model-00004-of-00004.safetensors",
|
399 |
+
"model.layers.30.block_sparse_moe.experts.2.w2.weight": "model-00004-of-00004.safetensors",
|
400 |
+
"model.layers.30.block_sparse_moe.experts.2.w3.weight": "model-00004-of-00004.safetensors",
|
401 |
+
"model.layers.30.block_sparse_moe.gate.weight": "model-00004-of-00004.safetensors",
|
402 |
+
"model.layers.30.input_layernorm.weight": "model-00004-of-00004.safetensors",
|
403 |
+
"model.layers.30.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
|
404 |
+
"model.layers.30.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
|
405 |
+
"model.layers.30.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
|
406 |
+
"model.layers.30.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
|
407 |
+
"model.layers.30.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
|
408 |
+
"model.layers.31.block_sparse_moe.experts.0.w1.weight": "model-00004-of-00004.safetensors",
|
409 |
+
"model.layers.31.block_sparse_moe.experts.0.w2.weight": "model-00004-of-00004.safetensors",
|
410 |
+
"model.layers.31.block_sparse_moe.experts.0.w3.weight": "model-00004-of-00004.safetensors",
|
411 |
+
"model.layers.31.block_sparse_moe.experts.1.w1.weight": "model-00004-of-00004.safetensors",
|
412 |
+
"model.layers.31.block_sparse_moe.experts.1.w2.weight": "model-00004-of-00004.safetensors",
|
413 |
+
"model.layers.31.block_sparse_moe.experts.1.w3.weight": "model-00004-of-00004.safetensors",
|
414 |
+
"model.layers.31.block_sparse_moe.experts.2.w1.weight": "model-00004-of-00004.safetensors",
|
415 |
+
"model.layers.31.block_sparse_moe.experts.2.w2.weight": "model-00004-of-00004.safetensors",
|
416 |
+
"model.layers.31.block_sparse_moe.experts.2.w3.weight": "model-00004-of-00004.safetensors",
|
417 |
+
"model.layers.31.block_sparse_moe.gate.weight": "model-00004-of-00004.safetensors",
|
418 |
+
"model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors",
|
419 |
+
"model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
|
420 |
+
"model.layers.31.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
|
421 |
+
"model.layers.31.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
|
422 |
+
"model.layers.31.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
|
423 |
+
"model.layers.31.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
|
424 |
+
"model.layers.4.block_sparse_moe.experts.0.w1.weight": "model-00001-of-00004.safetensors",
|
425 |
+
"model.layers.4.block_sparse_moe.experts.0.w2.weight": "model-00001-of-00004.safetensors",
|
426 |
+
"model.layers.4.block_sparse_moe.experts.0.w3.weight": "model-00001-of-00004.safetensors",
|
427 |
+
"model.layers.4.block_sparse_moe.experts.1.w1.weight": "model-00001-of-00004.safetensors",
|
428 |
+
"model.layers.4.block_sparse_moe.experts.1.w2.weight": "model-00001-of-00004.safetensors",
|
429 |
+
"model.layers.4.block_sparse_moe.experts.1.w3.weight": "model-00001-of-00004.safetensors",
|
430 |
+
"model.layers.4.block_sparse_moe.experts.2.w1.weight": "model-00001-of-00004.safetensors",
|
431 |
+
"model.layers.4.block_sparse_moe.experts.2.w2.weight": "model-00001-of-00004.safetensors",
|
432 |
+
"model.layers.4.block_sparse_moe.experts.2.w3.weight": "model-00001-of-00004.safetensors",
|
433 |
+
"model.layers.4.block_sparse_moe.gate.weight": "model-00001-of-00004.safetensors",
|
434 |
+
"model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
435 |
+
"model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
436 |
+
"model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
437 |
+
"model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
438 |
+
"model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
439 |
+
"model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
440 |
+
"model.layers.5.block_sparse_moe.experts.0.w1.weight": "model-00001-of-00004.safetensors",
|
441 |
+
"model.layers.5.block_sparse_moe.experts.0.w2.weight": "model-00001-of-00004.safetensors",
|
442 |
+
"model.layers.5.block_sparse_moe.experts.0.w3.weight": "model-00001-of-00004.safetensors",
|
443 |
+
"model.layers.5.block_sparse_moe.experts.1.w1.weight": "model-00001-of-00004.safetensors",
|
444 |
+
"model.layers.5.block_sparse_moe.experts.1.w2.weight": "model-00001-of-00004.safetensors",
|
445 |
+
"model.layers.5.block_sparse_moe.experts.1.w3.weight": "model-00001-of-00004.safetensors",
|
446 |
+
"model.layers.5.block_sparse_moe.experts.2.w1.weight": "model-00001-of-00004.safetensors",
|
447 |
+
"model.layers.5.block_sparse_moe.experts.2.w2.weight": "model-00001-of-00004.safetensors",
|
448 |
+
"model.layers.5.block_sparse_moe.experts.2.w3.weight": "model-00001-of-00004.safetensors",
|
449 |
+
"model.layers.5.block_sparse_moe.gate.weight": "model-00001-of-00004.safetensors",
|
450 |
+
"model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
451 |
+
"model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
452 |
+
"model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
453 |
+
"model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
454 |
+
"model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
455 |
+
"model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
456 |
+
"model.layers.6.block_sparse_moe.experts.0.w1.weight": "model-00001-of-00004.safetensors",
|
457 |
+
"model.layers.6.block_sparse_moe.experts.0.w2.weight": "model-00001-of-00004.safetensors",
|
458 |
+
"model.layers.6.block_sparse_moe.experts.0.w3.weight": "model-00001-of-00004.safetensors",
|
459 |
+
"model.layers.6.block_sparse_moe.experts.1.w1.weight": "model-00001-of-00004.safetensors",
|
460 |
+
"model.layers.6.block_sparse_moe.experts.1.w2.weight": "model-00001-of-00004.safetensors",
|
461 |
+
"model.layers.6.block_sparse_moe.experts.1.w3.weight": "model-00001-of-00004.safetensors",
|
462 |
+
"model.layers.6.block_sparse_moe.experts.2.w1.weight": "model-00001-of-00004.safetensors",
|
463 |
+
"model.layers.6.block_sparse_moe.experts.2.w2.weight": "model-00001-of-00004.safetensors",
|
464 |
+
"model.layers.6.block_sparse_moe.experts.2.w3.weight": "model-00001-of-00004.safetensors",
|
465 |
+
"model.layers.6.block_sparse_moe.gate.weight": "model-00001-of-00004.safetensors",
|
466 |
+
"model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
467 |
+
"model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
468 |
+
"model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
469 |
+
"model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
470 |
+
"model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
471 |
+
"model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
472 |
+
"model.layers.7.block_sparse_moe.experts.0.w1.weight": "model-00001-of-00004.safetensors",
|
473 |
+
"model.layers.7.block_sparse_moe.experts.0.w2.weight": "model-00001-of-00004.safetensors",
|
474 |
+
"model.layers.7.block_sparse_moe.experts.0.w3.weight": "model-00001-of-00004.safetensors",
|
475 |
+
"model.layers.7.block_sparse_moe.experts.1.w1.weight": "model-00001-of-00004.safetensors",
|
476 |
+
"model.layers.7.block_sparse_moe.experts.1.w2.weight": "model-00001-of-00004.safetensors",
|
477 |
+
"model.layers.7.block_sparse_moe.experts.1.w3.weight": "model-00001-of-00004.safetensors",
|
478 |
+
"model.layers.7.block_sparse_moe.experts.2.w1.weight": "model-00001-of-00004.safetensors",
|
479 |
+
"model.layers.7.block_sparse_moe.experts.2.w2.weight": "model-00001-of-00004.safetensors",
|
480 |
+
"model.layers.7.block_sparse_moe.experts.2.w3.weight": "model-00001-of-00004.safetensors",
|
481 |
+
"model.layers.7.block_sparse_moe.gate.weight": "model-00001-of-00004.safetensors",
|
482 |
+
"model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
483 |
+
"model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
484 |
+
"model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
485 |
+
"model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
486 |
+
"model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
487 |
+
"model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
488 |
+
"model.layers.8.block_sparse_moe.experts.0.w1.weight": "model-00001-of-00004.safetensors",
|
489 |
+
"model.layers.8.block_sparse_moe.experts.0.w2.weight": "model-00001-of-00004.safetensors",
|
490 |
+
"model.layers.8.block_sparse_moe.experts.0.w3.weight": "model-00001-of-00004.safetensors",
|
491 |
+
"model.layers.8.block_sparse_moe.experts.1.w1.weight": "model-00001-of-00004.safetensors",
|
492 |
+
"model.layers.8.block_sparse_moe.experts.1.w2.weight": "model-00001-of-00004.safetensors",
|
493 |
+
"model.layers.8.block_sparse_moe.experts.1.w3.weight": "model-00001-of-00004.safetensors",
|
494 |
+
"model.layers.8.block_sparse_moe.experts.2.w1.weight": "model-00001-of-00004.safetensors",
|
495 |
+
"model.layers.8.block_sparse_moe.experts.2.w2.weight": "model-00001-of-00004.safetensors",
|
496 |
+
"model.layers.8.block_sparse_moe.experts.2.w3.weight": "model-00001-of-00004.safetensors",
|
497 |
+
"model.layers.8.block_sparse_moe.gate.weight": "model-00001-of-00004.safetensors",
|
498 |
+
"model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
499 |
+
"model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
500 |
+
"model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
501 |
+
"model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
502 |
+
"model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
503 |
+
"model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
504 |
+
"model.layers.9.block_sparse_moe.experts.0.w1.weight": "model-00002-of-00004.safetensors",
|
505 |
+
"model.layers.9.block_sparse_moe.experts.0.w2.weight": "model-00002-of-00004.safetensors",
|
506 |
+
"model.layers.9.block_sparse_moe.experts.0.w3.weight": "model-00002-of-00004.safetensors",
|
507 |
+
"model.layers.9.block_sparse_moe.experts.1.w1.weight": "model-00002-of-00004.safetensors",
|
508 |
+
"model.layers.9.block_sparse_moe.experts.1.w2.weight": "model-00002-of-00004.safetensors",
|
509 |
+
"model.layers.9.block_sparse_moe.experts.1.w3.weight": "model-00002-of-00004.safetensors",
|
510 |
+
"model.layers.9.block_sparse_moe.experts.2.w1.weight": "model-00002-of-00004.safetensors",
|
511 |
+
"model.layers.9.block_sparse_moe.experts.2.w2.weight": "model-00002-of-00004.safetensors",
|
512 |
+
"model.layers.9.block_sparse_moe.experts.2.w3.weight": "model-00002-of-00004.safetensors",
|
513 |
+
"model.layers.9.block_sparse_moe.gate.weight": "model-00002-of-00004.safetensors",
|
514 |
+
"model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
515 |
+
"model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
516 |
+
"model.layers.9.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
517 |
+
"model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
518 |
+
"model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
519 |
+
"model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
520 |
+
"model.norm.weight": "model-00004-of-00004.safetensors"
|
521 |
+
}
|
522 |
+
}
|