Josh Cole committed
Commit · d8ed61a · 1 Parent(s): fd184d9
update

Files changed:
- Generate.ipynb +64 -35
- config.json +1 -1
- pytorch_model.bin +1 -1
- training_args.bin +1 -1
- vocab.json +1 -1
Generate.ipynb
CHANGED
@@ -28,7 +28,7 @@
  },
  {
  "cell_type": "code",
- "execution_count":
+ "execution_count": 87,
  "id": "38bdf299-f60d-43ea-9230-df1be861e406",
  "metadata": {},
  "outputs": [
@@ -36,14 +36,14 @@
  "name": "stderr",
  "output_type": "stream",
  "text": [
- "Using custom data configuration sharpcoder--bjorn_training-
- "Reusing dataset parquet (/home/sharpcoder/.cache/huggingface/datasets/sharpcoder___parquet/sharpcoder--bjorn_training-
+ "Using custom data configuration sharpcoder--bjorn_training-49dfdd879ea26ec8\n",
+ "Reusing dataset parquet (/home/sharpcoder/.cache/huggingface/datasets/sharpcoder___parquet/sharpcoder--bjorn_training-49dfdd879ea26ec8/0.0.0/7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8)\n"
  ]
  },
  {
  "data": {
  "application/vnd.jupyter.widget-view+json": {
- "model_id": "
+ "model_id": "7ace7bfb1aee4fe3946da3cf4616edb6",
  "version_major": 2,
  "version_minor": 0
  },
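The stderr output above comes from loading the training parquet data from the Hugging Face Hub; the loading cell itself is unchanged in this commit, so its source is not shown. The sketch below is a plausible reconstruction, assuming the data lives in the `sharpcoder/bjorn_training` dataset repo implied by the cache path, and that `ds` is the `DatasetDict` used by the later cells.

```python
# Hypothetical reconstruction of the dataset-loading cell (not part of this diff).
from datasets import load_dataset

# Yields a DatasetDict with a "train" split holding "audio" and "text" columns,
# matching the ds.column_names['train'] output shown further down.
ds = load_dataset("sharpcoder/bjorn_training")
print(ds.column_names)  # e.g. {'train': ['audio', 'text']}
```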
@@ -62,14 +62,14 @@
  },
  {
  "cell_type": "code",
- "execution_count":
+ "execution_count": 89,
  "id": "75b32151-eb53-4476-8c1f-7e6da72e173e",
  "metadata": {},
  "outputs": [
  {
  "data": {
  "application/vnd.jupyter.widget-view+json": {
- "model_id": "
+ "model_id": "5d7fae2ce3cb4c9c9be58bf46d991f3d",
  "version_major": 2,
  "version_minor": 0
  },
@@ -102,7 +102,7 @@
  },
  {
  "cell_type": "code",
- "execution_count":
+ "execution_count": 90,
  "id": "d214872e-d4b1-4aa7-be07-8a1591961968",
  "metadata": {},
  "outputs": [],
@@ -111,60 +111,89 @@
  "from transformers import Wav2Vec2FeatureExtractor\n",
  "from transformers import Wav2Vec2Processor\n",
  "\n",
- "tokenizer = Wav2Vec2CTCTokenizer(\"./vocab.json\", unk_token=\"[UNK]\", pad_token=\"[PAD]\", word_delimiter_token=\"
+ "tokenizer = Wav2Vec2CTCTokenizer(\"./vocab.json\", unk_token=\"[UNK]\", pad_token=\"[PAD]\", word_delimiter_token=\"|\")\n",
  "feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=False)\n",
  "processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)"
  ]
  },
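Written out as a plain script, the processor cell amended above does the following; this is a minimal sketch of the same calls, assuming `vocab.json` sits next to the notebook and maps the word delimiter `"|"` to an id of its own, as in the updated vocab at the bottom of this diff.

```python
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor

# Character-level CTC tokenizer built from the repo's vocab.json;
# "|" stands in for the space between words.
tokenizer = Wav2Vec2CTCTokenizer(
    "./vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)

# Raw-waveform feature extractor: 16 kHz mono input, zero padding, per-utterance normalization.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=False,
)

# Bundles both so one object can preprocess audio and encode/decode transcripts.
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
```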
  {
  "cell_type": "code",
- "execution_count":
+ "execution_count": 107,
  "id": "e906c45f-6971-43c3-ad0a-b13363100bdf",
  "metadata": {},
  "outputs": [],
  "source": [
  "def prepare_dataset(batch):\n",
- " audio = batch[\"audio\"]\n",
- "\n",
+ " audio = batch[\"audio\"][0]\n",
  " # batched output is \"un-batched\" to ensure mapping is correct\n",
  " batch[\"input_values\"] = processor(audio[\"array\"], sampling_rate=audio[\"sample_rate\"]).input_values[0]\n",
  " batch[\"input_length\"] = len(batch[\"input_values\"])\n",
  " \n",
  " with processor.as_target_processor():\n",
  " batch[\"labels\"] = processor(batch[\"text\"]).input_ids\n",
- " return batch"
+ " return batch\n"
  ]
  },
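Written out as a standalone function, the amended `prepare_dataset` is sketched below. The `[0]` indexing and the `"sample_rate"` key reflect how this particular parquet dataset appears to store its audio column (a list wrapping a dict with `"array"` and `"sample_rate"`); that is an assumption about the data rather than the standard `datasets.Audio` feature, which uses `"sampling_rate"`.

```python
def prepare_dataset(batch):
    # This dataset stores the audio column as a list wrapping a single
    # {"array": ..., "sample_rate": ...} dict, hence the [0] added in this commit.
    audio = batch["audio"][0]

    # Batched output is "un-batched" to ensure mapping is correct.
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sample_rate"]).input_values[0]
    batch["input_length"] = len(batch["input_values"])

    # Encode the transcript with the tokenizer side of the processor.
    with processor.as_target_processor():
        batch["labels"] = processor(batch["text"]).input_ids
    return batch
```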
  {
  "cell_type": "code",
- "execution_count":
- "id": "
+ "execution_count": 108,
+ "id": "859e5dce-cb41-4647-98cb-de084a4b9d7e",
  "metadata": {},
  "outputs": [
  {
- "
-
-
-
-
-
+ "data": {
+ "text/plain": [
+ "['audio', 'text']"
+ ]
+ },
+ "execution_count": 108,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ds.column_names['train']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 109,
+ "id": "8c083db6-eab5-4f25-9a08-eab50d2d30ac",
+ "metadata": {},
+ "outputs": [
  {
  "data": {
  "application/vnd.jupyter.widget-view+json": {
- "model_id": "
+ "model_id": "7575024d6ef048e39d7b25654f993957",
  "version_major": 2,
  "version_minor": 0
  },
  "text/plain": [
- " 0%| | 0/
+ " 0%| | 0/4 [00:00<?, ?ex/s]"
  ]
  },
  "metadata": {},
  "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "IOPub data rate exceeded.\n",
+ "The Jupyter server will temporarily stop sending output\n",
+ "to the client in order to avoid crashing it.\n",
+ "To change this limit, set the config variable\n",
+ "`--ServerApp.iopub_data_rate_limit`.\n",
+ "\n",
+ "Current values:\n",
+ "ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)\n",
+ "ServerApp.rate_limit_window=3.0 (secs)\n",
+ "\n"
+ ]
  }
  ],
  "source": [
- "ds_prepared = ds.map(prepare_dataset, remove_columns=ds.column_names[\"train\"], num_proc=
+ "ds_prepared = ds.map(prepare_dataset, remove_columns=ds.column_names[\"train\"], num_proc=1)"
  ]
  },
  {
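Taken together, the new cells above first inspect the train split's columns and then map `prepare_dataset` over the whole dataset while dropping the raw columns. A minimal sketch, assuming `ds` and `prepare_dataset` are defined as above; the commented server flag is the knob the stderr message points at (`--ServerApp.iopub_data_rate_limit`) and only affects how much output the notebook displays, not the mapping itself.

```python
# Sanity-check which raw columns exist before removing them.
print(ds.column_names["train"])  # ['audio', 'text']

# Map the preprocessing over every example; remove_columns drops the raw
# "audio"/"text" columns so only input_values/input_length/labels remain.
ds_prepared = ds.map(
    prepare_dataset,
    remove_columns=ds.column_names["train"],
    num_proc=1,  # single process; the progress bar shows only 4 examples
)

# The "IOPub data rate exceeded" warning is Jupyter throttling cell output,
# not a preprocessing failure. If needed, start the server with a higher limit:
#   jupyter lab --ServerApp.iopub_data_rate_limit=1e10
```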
@@ -253,7 +282,7 @@
  },
  {
  "cell_type": "code",
- "execution_count":
+ "execution_count": 58,
  "id": "71351cf4-6d00-40ae-89cc-cedb87073625",
  "metadata": {},
  "outputs": [
@@ -363,7 +392,7 @@
  },
  {
  "cell_type": "code",
- "execution_count":
+ "execution_count": 59,
  "id": "208eac7d-9fdd-4c82-b46f-25c1a1f246ee",
  "metadata": {},
  "outputs": [
@@ -385,7 +414,7 @@
  " group_by_length=True,\n",
  " per_device_train_batch_size=8,\n",
  " evaluation_strategy=\"steps\",\n",
- " num_train_epochs=
+ " num_train_epochs=2,\n",
  " fp16=False,\n",
  " gradient_checkpointing=True,\n",
  " save_steps=500,\n",
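The hunk above sits inside a `TrainingArguments` call; only the arguments visible in the diff are certain. The sketch below fills in the surrounding construction with placeholder values: the `output_dir`, the base checkpoint, and the `Trainer` wiring are assumptions for illustration, not taken from this commit (the training log does confirm a `Wav2Vec2ForCTC` model trained on the prepared dataset).

```python
from transformers import Trainer, TrainingArguments, Wav2Vec2ForCTC

# Assumed base checkpoint; the cell that creates the model is not in this diff.
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-base",                       # assumption
    vocab_size=len(processor.tokenizer),
    pad_token_id=processor.tokenizer.pad_token_id,  # 26, matching config.json below
)

training_args = TrainingArguments(
    output_dir="./wav2vec2_bjorn",                  # assumption; not shown in the diff
    group_by_length=True,
    per_device_train_batch_size=8,
    evaluation_strategy="steps",
    num_train_epochs=2,                             # the value set by this commit
    fp16=False,
    gradient_checkpointing=True,
    save_steps=500,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_prepared["train"],
    tokenizer=processor.feature_extractor,
    # data_collator=...  # a CTC padding collator defined in an earlier, unchanged cell
)
```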
@@ -410,7 +439,7 @@
  },
  {
  "cell_type": "code",
- "execution_count":
+ "execution_count": 60,
  "id": "d58f6b8c-441c-4fa9-a308-e687948875e1",
  "metadata": {},
  "outputs": [
@@ -421,11 +450,11 @@
  "The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length.\n",
  "***** Running training *****\n",
  " Num examples = 1\n",
- " Num Epochs =
+ " Num Epochs = 2\n",
  " Instantaneous batch size per device = 8\n",
  " Total train batch size (w. parallel, distributed & accumulation) = 8\n",
  " Gradient Accumulation steps = 1\n",
- " Total optimization steps =
+ " Total optimization steps = 2\n"
  ]
  },
  {
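The step count in this log follows directly from the numbers above: with 1 example, a batch size of 8 and no gradient accumulation, each epoch is a single optimization step, so 2 epochs give 2 steps. A quick check of that arithmetic (the ceil-division below mirrors how the Trainer sizes an epoch; treating it as exact is a mild assumption):

```python
import math

num_examples = 1
per_device_train_batch_size = 8
gradient_accumulation_steps = 1
num_train_epochs = 2

steps_per_epoch = math.ceil(num_examples / (per_device_train_batch_size * gradient_accumulation_steps))
total_optimization_steps = steps_per_epoch * num_train_epochs
print(total_optimization_steps)  # 2, matching "Total optimization steps = 2"
```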
@@ -434,8 +463,8 @@
  "\n",
  " <div>\n",
  " \n",
- " <progress value='
- " [
+ " <progress value='2' max='2' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+ " [2/2 00:01, Epoch 2/2]\n",
  " </div>\n",
  " <table border=\"1\" class=\"dataframe\">\n",
  " <thead>\n",
@@ -470,10 +499,10 @@
  {
  "data": {
  "text/plain": [
- "TrainOutput(global_step=
+ "TrainOutput(global_step=2, training_loss=16.662765502929688, metrics={'train_runtime': 1.915, 'train_samples_per_second': 1.044, 'train_steps_per_second': 1.044, 'total_flos': 62916657621120.0, 'train_loss': 16.662765502929688, 'epoch': 2.0})"
  ]
  },
- "execution_count":
+ "execution_count": 60,
  "metadata": {},
  "output_type": "execute_result"
  }
@@ -484,7 +513,7 @@
  },
  {
  "cell_type": "code",
- "execution_count":
+ "execution_count": 61,
  "id": "333d43cf-add3-4d78-bbca-b44c638519fe",
  "metadata": {},
  "outputs": [
@@ -505,7 +534,7 @@
  "traceback": [
  "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
  "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
- "Input \u001b[0;32mIn [
+ "Input \u001b[0;32mIn [61]\u001b[0m, in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpush_to_hub\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhub_model_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43msharpcoder/wav2vec2_bjorn\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
  "File \u001b[0;32m~/.local/lib/python3.10/site-packages/transformers/trainer.py:2677\u001b[0m, in \u001b[0;36mTrainer.push_to_hub\u001b[0;34m(self, commit_message, blocking, **kwargs)\u001b[0m\n\u001b[1;32m 2674\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mis_world_process_zero():\n\u001b[1;32m 2675\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[0;32m-> 2677\u001b[0m git_head_commit_url \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrepo\u001b[49m\u001b[38;5;241m.\u001b[39mpush_to_hub(commit_message\u001b[38;5;241m=\u001b[39mcommit_message, blocking\u001b[38;5;241m=\u001b[39mblocking)\n\u001b[1;32m 2678\u001b[0m \u001b[38;5;66;03m# push separately the model card to be independant from the rest of the model\u001b[39;00m\n\u001b[1;32m 2679\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mshould_save:\n",
  "\u001b[0;31mAttributeError\u001b[0m: 'Trainer' object has no attribute 'repo'"
  ]
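The traceback left in the notebook shows `trainer.push_to_hub(hub_model_id="sharpcoder/wav2vec2_bjorn")` failing with `'Trainer' object has no attribute 'repo'`. In the transformers 4.x line in use here, the Trainer only sets up its Hub git repo during initialization when `push_to_hub=True` is set on `TrainingArguments`, so calling `trainer.push_to_hub()` without that leaves `self.repo` unset; `hub_model_id` also belongs on `TrainingArguments` rather than on the `push_to_hub` call. The sketch below shows two ways around the error; it is a suggestion based on that behavior, not a change made in this commit.

```python
from transformers import TrainingArguments

# Option 1: let the Trainer manage the Hub repo by declaring it up front.
training_args = TrainingArguments(
    output_dir="./wav2vec2_bjorn",             # assumption
    push_to_hub=True,                          # makes the Trainer initialize self.repo
    hub_model_id="sharpcoder/wav2vec2_bjorn",
    # ... the other arguments shown earlier ...
)
# trainer = Trainer(model=model, args=training_args, ...)
# trainer.train()
# trainer.push_to_hub()                        # uploads the model and a model card

# Option 2: skip the Trainer plumbing and push the artifacts directly.
# model.push_to_hub("sharpcoder/wav2vec2_bjorn")
# processor.push_to_hub("sharpcoder/wav2vec2_bjorn")
```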
config.json
CHANGED
@@ -71,7 +71,7 @@
  "num_feat_extract_layers": 7,
  "num_hidden_layers": 12,
  "num_negatives": 100,
- "pad_token_id":
+ "pad_token_id": 26,
  "proj_codevector_dim": 256,
  "torch_dtype": "float32",
  "transformers_version": "4.11.3",
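Setting `pad_token_id` to 26 lines the model config up with the `[PAD]` entry in the updated vocab.json below; for a CTC head the pad id also serves as the blank token, so the two files have to agree. A small consistency check, assuming both files are read from this repo's working directory (the paths are illustrative):

```python
from transformers import Wav2Vec2Config, Wav2Vec2CTCTokenizer

config = Wav2Vec2Config.from_pretrained("./config.json")
tokenizer = Wav2Vec2CTCTokenizer(
    "./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|"
)

# The CTC blank/padding index baked into the model must match the tokenizer's [PAD] id (26 here).
assert config.pad_token_id == tokenizer.pad_token_id == 26
```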
pytorch_model.bin
CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:
+ oid sha256:124b58d6d3dbb0ca4f29d31fad8c5ad9a70bc43d141b954dd380d21dfebedc17
  size 377667031
training_args.bin
CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:
+ oid sha256:d088aa47e4ea7b9e0e9873e040b77e2eb035cf9848d076792a823f0550eed203
  size 2735
vocab.json
CHANGED
@@ -1 +1 @@
- {"
+ {"i": 1, "e": 2, "p": 3, "h": 4, "c": 5, "r": 6, "x": 7, "m": 8, "v": 9, "w": 10, "|": 0, "j": 12, ".": 13, "d": 14, "y": 15, "a": 16, "f": 17, "s": 18, "l": 19, "u": 20, "o": 21, "n": 22, "b": 23, "t": 24, "g": 25, "[UNK]": 25, "[PAD]": 26}
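The new vocab maps each character seen in the transcripts to an id, assigns the word delimiter `"|"` to 0, and appends `[UNK]` and `[PAD]` (note that, as committed, `"g"` and `"[UNK]"` both map to 25 and id 11 is unused, which is worth double-checking). The sketch below is one common way to build such a file from the dataset's `text` column, in the spirit of the Hugging Face wav2vec2 fine-tuning recipe; it is an illustration, not necessarily how this exact file was generated.

```python
import json

def build_vocab(texts):
    # Collect every character used in the transcripts.
    chars = sorted(set("".join(texts)))
    vocab = {c: i for i, c in enumerate(chars)}

    # Use "|" as the word delimiter instead of a literal space.
    if " " in vocab:
        vocab["|"] = vocab.pop(" ")

    # Special tokens go at the end, each with its own id.
    vocab["[UNK]"] = len(vocab)
    vocab["[PAD]"] = len(vocab)
    return vocab

# Assumes ds is the DatasetDict loaded earlier, with a "text" column in the train split.
vocab = build_vocab(ds["train"]["text"])
with open("vocab.json", "w") as f:
    json.dump(vocab, f)
```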