Training in progress, epoch 2
Browse files- pytorch_model.bin +1 -1
- train_factual_consistency.ipynb +7 -37
pytorch_model.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 274752173
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7d9fe1fd2b8ff6706c8464c30b35de48c417dea60cb57eee31281e0106c6005a
|
3 |
size 274752173
|
train_factual_consistency.ipynb
CHANGED
@@ -9,7 +9,7 @@
|
|
9 |
{
|
10 |
"data": {
|
11 |
"application/vnd.jupyter.widget-view+json": {
|
12 |
-
"model_id": "
|
13 |
"version_major": 2,
|
14 |
"version_minor": 0
|
15 |
},
|
@@ -81,7 +81,7 @@
|
|
81 |
{
|
82 |
"data": {
|
83 |
"application/vnd.jupyter.widget-view+json": {
|
84 |
-
"model_id": "
|
85 |
"version_major": 2,
|
86 |
"version_minor": 0
|
87 |
},
|
@@ -103,7 +103,7 @@
|
|
103 |
{
|
104 |
"data": {
|
105 |
"application/vnd.jupyter.widget-view+json": {
|
106 |
-
"model_id": "
|
107 |
"version_major": 2,
|
108 |
"version_minor": 0
|
109 |
},
|
@@ -125,7 +125,7 @@
|
|
125 |
},
|
126 |
{
|
127 |
"cell_type": "code",
|
128 |
-
"execution_count":
|
129 |
"id": "6bc83d4c-378c-4313-b641-8ead0c02f715",
|
130 |
"metadata": {},
|
131 |
"outputs": [
|
@@ -144,8 +144,8 @@
|
|
144 |
"\n",
|
145 |
" <div>\n",
|
146 |
" \n",
|
147 |
-
" <progress value='
|
148 |
-
" [
|
149 |
" </div>\n",
|
150 |
" <table border=\"1\" class=\"dataframe\">\n",
|
151 |
" <thead>\n",
|
@@ -159,22 +159,7 @@
|
|
159 |
" <tr>\n",
|
160 |
" <td>1</td>\n",
|
161 |
" <td>No log</td>\n",
|
162 |
-
" <td
|
163 |
-
" </tr>\n",
|
164 |
-
" <tr>\n",
|
165 |
-
" <td>2</td>\n",
|
166 |
-
" <td>-2.673600</td>\n",
|
167 |
-
" <td>-7.069220</td>\n",
|
168 |
-
" </tr>\n",
|
169 |
-
" <tr>\n",
|
170 |
-
" <td>3</td>\n",
|
171 |
-
" <td>-2.673600</td>\n",
|
172 |
-
" <td>-11.083688</td>\n",
|
173 |
-
" </tr>\n",
|
174 |
-
" <tr>\n",
|
175 |
-
" <td>4</td>\n",
|
176 |
-
" <td>-8.789900</td>\n",
|
177 |
-
" <td>-15.529228</td>\n",
|
178 |
" </tr>\n",
|
179 |
" </tbody>\n",
|
180 |
"</table><p>"
|
@@ -185,21 +170,6 @@
|
|
185 |
},
|
186 |
"metadata": {},
|
187 |
"output_type": "display_data"
|
188 |
-
},
|
189 |
-
{
|
190 |
-
"ename": "KeyboardInterrupt",
|
191 |
-
"evalue": "",
|
192 |
-
"output_type": "error",
|
193 |
-
"traceback": [
|
194 |
-
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
195 |
-
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
|
196 |
-
"Cell \u001b[0;32mIn[4], line 27\u001b[0m\n\u001b[1;32m 17\u001b[0m data_collator \u001b[38;5;241m=\u001b[39m DataCollatorWithPadding(tokenizer\u001b[38;5;241m=\u001b[39mtokenizer)\n\u001b[1;32m 18\u001b[0m trainer \u001b[38;5;241m=\u001b[39m Trainer(\n\u001b[1;32m 19\u001b[0m model\u001b[38;5;241m=\u001b[39mmodel,\n\u001b[1;32m 20\u001b[0m args\u001b[38;5;241m=\u001b[39mtraining_args,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 24\u001b[0m data_collator\u001b[38;5;241m=\u001b[39mdata_collator,\n\u001b[1;32m 25\u001b[0m )\n\u001b[0;32m---> 27\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 28\u001b[0m trainer\u001b[38;5;241m.\u001b[39mpush_to_hub(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mfactual-consistency-regression-ja\u001b[39m\u001b[38;5;124m'\u001b[39m)\n",
|
197 |
-
"File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1582\u001b[0m, in \u001b[0;36mTrainer.train\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m 1579\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1580\u001b[0m \u001b[38;5;66;03m# Disable progress bars when uploading models during checkpoints to avoid polluting stdout\u001b[39;00m\n\u001b[1;32m 1581\u001b[0m hf_hub_utils\u001b[38;5;241m.\u001b[39mdisable_progress_bars()\n\u001b[0;32m-> 1582\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_training_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1583\u001b[0m \u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1584\u001b[0m \u001b[43m \u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1585\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrial\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1586\u001b[0m \u001b[43m \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1587\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1588\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 1589\u001b[0m hf_hub_utils\u001b[38;5;241m.\u001b[39menable_progress_bars()\n",
|
198 |
-
"File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1950\u001b[0m, in \u001b[0;36mTrainer._inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m 1945\u001b[0m nn\u001b[38;5;241m.\u001b[39mutils\u001b[38;5;241m.\u001b[39mclip_grad_norm_(\n\u001b[1;32m 1946\u001b[0m amp\u001b[38;5;241m.\u001b[39mmaster_params(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptimizer),\n\u001b[1;32m 1947\u001b[0m args\u001b[38;5;241m.\u001b[39mmax_grad_norm,\n\u001b[1;32m 1948\u001b[0m )\n\u001b[1;32m 1949\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1950\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43maccelerator\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mclip_grad_norm_\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1951\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mparameters\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1952\u001b[0m \u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmax_grad_norm\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1953\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1955\u001b[0m \u001b[38;5;66;03m# Optimizer step\u001b[39;00m\n\u001b[1;32m 1956\u001b[0m optimizer_was_run \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n",
|
199 |
-
"File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/accelerate/accelerator.py:2121\u001b[0m, in \u001b[0;36mAccelerator.clip_grad_norm_\u001b[0;34m(self, parameters, max_norm, norm_type)\u001b[0m\n\u001b[1;32m 2119\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 2120\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39munscale_gradients()\n\u001b[0;32m-> 2121\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mutils\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mclip_grad_norm_\u001b[49m\u001b[43m(\u001b[49m\u001b[43mparameters\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmax_norm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnorm_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mnorm_type\u001b[49m\u001b[43m)\u001b[49m\n",
|
200 |
-
"File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch_xla/_patched_functions.py:49\u001b[0m, in \u001b[0;36mclip_grad_norm_\u001b[0;34m(parameters, max_norm, norm_type, error_if_nonfinite, foreach)\u001b[0m\n\u001b[1;32m 44\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m error_if_nonfinite \u001b[38;5;129;01mand\u001b[39;00m (total_norm\u001b[38;5;241m.\u001b[39misnan() \u001b[38;5;129;01mor\u001b[39;00m total_norm\u001b[38;5;241m.\u001b[39misinf()):\n\u001b[1;32m 45\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\n\u001b[1;32m 46\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mThe norm of order \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnorm_type\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m for a gradient from `parameters` \u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 47\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mis non-finite, so it cannot be clipped. This error can be \u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 48\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdisabled with `error_if_nonfinite=False`\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m---> 49\u001b[0m clip_coef \u001b[38;5;241m=\u001b[39m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtensor\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmax_norm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdevice\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdevice\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;241m/\u001b[39m (total_norm \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m1e-6\u001b[39m)\n\u001b[1;32m 50\u001b[0m clip_value \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mwhere(clip_coef \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m1\u001b[39m, clip_coef,\n\u001b[1;32m 51\u001b[0m torch\u001b[38;5;241m.\u001b[39mtensor(\u001b[38;5;241m1.\u001b[39m, device\u001b[38;5;241m=\u001b[39mdevice))\n\u001b[1;32m 52\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m p \u001b[38;5;129;01min\u001b[39;00m parameters:\n",
|
201 |
-
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
|
202 |
-
]
|
203 |
}
|
204 |
],
|
205 |
"source": [
|
|
|
9 |
{
|
10 |
"data": {
|
11 |
"application/vnd.jupyter.widget-view+json": {
|
12 |
+
"model_id": "eab00695e2b240ffb58ab998c85c0e7d",
|
13 |
"version_major": 2,
|
14 |
"version_minor": 0
|
15 |
},
|
|
|
81 |
{
|
82 |
"data": {
|
83 |
"application/vnd.jupyter.widget-view+json": {
|
84 |
+
"model_id": "f20c88d0f96c4c06a9a0ddf835e544e3",
|
85 |
"version_major": 2,
|
86 |
"version_minor": 0
|
87 |
},
|
|
|
103 |
{
|
104 |
"data": {
|
105 |
"application/vnd.jupyter.widget-view+json": {
|
106 |
+
"model_id": "97591288ca9d40fd91e7737b41828f63",
|
107 |
"version_major": 2,
|
108 |
"version_minor": 0
|
109 |
},
|
|
|
125 |
},
|
126 |
{
|
127 |
"cell_type": "code",
|
128 |
+
"execution_count": null,
|
129 |
"id": "6bc83d4c-378c-4313-b641-8ead0c02f715",
|
130 |
"metadata": {},
|
131 |
"outputs": [
|
|
|
144 |
"\n",
|
145 |
" <div>\n",
|
146 |
" \n",
|
147 |
+
" <progress value='576' max='30600' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
|
148 |
+
" [ 576/30600 01:07 < 58:30, 8.55 it/s, Epoch 1.88/100]\n",
|
149 |
" </div>\n",
|
150 |
" <table border=\"1\" class=\"dataframe\">\n",
|
151 |
" <thead>\n",
|
|
|
159 |
" <tr>\n",
|
160 |
" <td>1</td>\n",
|
161 |
" <td>No log</td>\n",
|
162 |
+
" <td>0.085583</td>\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
163 |
" </tr>\n",
|
164 |
" </tbody>\n",
|
165 |
"</table><p>"
|
|
|
170 |
},
|
171 |
"metadata": {},
|
172 |
"output_type": "display_data"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
173 |
}
|
174 |
],
|
175 |
"source": [
|