Josh Cole committed
Commit d8ed61a · 1 Parent(s): fd184d9
Files changed (5)
  1. Generate.ipynb +64 -35
  2. config.json +1 -1
  3. pytorch_model.bin +1 -1
  4. training_args.bin +1 -1
  5. vocab.json +1 -1
Generate.ipynb CHANGED
@@ -28,7 +28,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 2,
+ "execution_count": 87,
  "id": "38bdf299-f60d-43ea-9230-df1be861e406",
  "metadata": {},
  "outputs": [
@@ -36,14 +36,14 @@
  "name": "stderr",
  "output_type": "stream",
  "text": [
- "Using custom data configuration sharpcoder--bjorn_training-8c32a3534606a113\n",
- "Reusing dataset parquet (/home/sharpcoder/.cache/huggingface/datasets/sharpcoder___parquet/sharpcoder--bjorn_training-8c32a3534606a113/0.0.0/7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8)\n"
+ "Using custom data configuration sharpcoder--bjorn_training-49dfdd879ea26ec8\n",
+ "Reusing dataset parquet (/home/sharpcoder/.cache/huggingface/datasets/sharpcoder___parquet/sharpcoder--bjorn_training-49dfdd879ea26ec8/0.0.0/7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8)\n"
  ]
  },
  {
  "data": {
  "application/vnd.jupyter.widget-view+json": {
- "model_id": "18cae671f8fd4f9baac804c91fee03bf",
+ "model_id": "7ace7bfb1aee4fe3946da3cf4616edb6",
  "version_major": 2,
  "version_minor": 0
  },
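Note: the stderr output in this hunk is `datasets` resolving the repo's parquet files; the configuration hash changed because the underlying training data changed in this commit. A minimal sketch of the load that presumably produced it (the actual cell sits above this hunk and is outside the diff; the dataset name is inferred from the cache path):

```python
from datasets import load_dataset

# Inferred from "sharpcoder--bjorn_training" in the cache path above;
# the exact call is not shown in this diff.
ds = load_dataset("sharpcoder/bjorn_training")
```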
@@ -62,14 +62,14 @@
  },
  {
  "cell_type": "code",
- "execution_count": 22,
+ "execution_count": 89,
  "id": "75b32151-eb53-4476-8c1f-7e6da72e173e",
  "metadata": {},
  "outputs": [
  {
  "data": {
  "application/vnd.jupyter.widget-view+json": {
- "model_id": "0611b2fa6cf740d6925d03cf3ba525a2",
+ "model_id": "5d7fae2ce3cb4c9c9be58bf46d991f3d",
  "version_major": 2,
  "version_minor": 0
  },
@@ -102,7 +102,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 23,
+ "execution_count": 90,
  "id": "d214872e-d4b1-4aa7-be07-8a1591961968",
  "metadata": {},
  "outputs": [],
@@ -111,60 +111,89 @@
  "from transformers import Wav2Vec2FeatureExtractor\n",
  "from transformers import Wav2Vec2Processor\n",
  "\n",
- "tokenizer = Wav2Vec2CTCTokenizer(\"./vocab.json\", unk_token=\"[UNK]\", pad_token=\"[PAD]\", word_delimiter_token=\" \")\n",
+ "tokenizer = Wav2Vec2CTCTokenizer(\"./vocab.json\", unk_token=\"[UNK]\", pad_token=\"[PAD]\", word_delimiter_token=\"|\")\n",
  "feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=False)\n",
  "processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)"
  ]
  },
  {
  "cell_type": "code",
- "execution_count": 24,
+ "execution_count": 107,
  "id": "e906c45f-6971-43c3-ad0a-b13363100bdf",
  "metadata": {},
  "outputs": [],
  "source": [
  "def prepare_dataset(batch):\n",
- " audio = batch[\"audio\"]\n",
- "\n",
+ " audio = batch[\"audio\"][0]\n",
  " # batched output is \"un-batched\" to ensure mapping is correct\n",
  " batch[\"input_values\"] = processor(audio[\"array\"], sampling_rate=audio[\"sample_rate\"]).input_values[0]\n",
  " batch[\"input_length\"] = len(batch[\"input_values\"])\n",
  " \n",
  " with processor.as_target_processor():\n",
  " batch[\"labels\"] = processor(batch[\"text\"]).input_ids\n",
- " return batch"
+ " return batch\n"
  ]
  },
  {
  "cell_type": "code",
- "execution_count": 25,
- "id": "8c083db6-eab5-4f25-9a08-eab50d2d30ac",
+ "execution_count": 108,
+ "id": "859e5dce-cb41-4647-98cb-de084a4b9d7e",
  "metadata": {},
  "outputs": [
  {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.\n"
- ]
- },
+ "data": {
+ "text/plain": [
+ "['audio', 'text']"
+ ]
+ },
+ "execution_count": 108,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ds.column_names['train']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 109,
+ "id": "8c083db6-eab5-4f25-9a08-eab50d2d30ac",
+ "metadata": {},
+ "outputs": [
  {
  "data": {
  "application/vnd.jupyter.widget-view+json": {
- "model_id": "ae21f7b6a50241e4ab4dd2b5c7c5689c",
+ "model_id": "7575024d6ef048e39d7b25654f993957",
  "version_major": 2,
  "version_minor": 0
  },
  "text/plain": [
- " 0%| | 0/1 [00:00<?, ?ex/s]"
+ " 0%| | 0/4 [00:00<?, ?ex/s]"
  ]
  },
  "metadata": {},
  "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "IOPub data rate exceeded.\n",
+ "The Jupyter server will temporarily stop sending output\n",
+ "to the client in order to avoid crashing it.\n",
+ "To change this limit, set the config variable\n",
+ "`--ServerApp.iopub_data_rate_limit`.\n",
+ "\n",
+ "Current values:\n",
+ "ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)\n",
+ "ServerApp.rate_limit_window=3.0 (secs)\n",
+ "\n"
+ ]
  }
  ],
  "source": [
- "ds_prepared = ds.map(prepare_dataset, remove_columns=ds.column_names[\"train\"], num_proc=4)"
+ "ds_prepared = ds.map(prepare_dataset, remove_columns=ds.column_names[\"train\"], num_proc=1)"
  ]
  },
  {
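This hunk carries the substantive fixes of the commit: the CTC word delimiter becomes `"|"` rather than a literal space, `prepare_dataset` indexes `batch["audio"][0]` because the audio column evidently holds a list per example, a new cell inspects `ds.column_names['train']`, and `ds.map` drops `num_proc=4` (the previous run had warned "num_proc must be <= 1" for the size-1 dataset). Unescaped from the notebook JSON, the updated cells read as follows (a sketch; `ds` comes from the loading cell above this hunk):

```python
from transformers import (
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2Processor,
)

# "|" is the conventional word delimiter for Wav2Vec2 CTC vocabularies;
# a literal space does not survive the round trip through vocab.json.
tokenizer = Wav2Vec2CTCTokenizer(
    "./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|"
)
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=False,
)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

def prepare_dataset(batch):
    # The audio column stores a list per example, hence the [0];
    # this dataset names its rate field "sample_rate".
    audio = batch["audio"][0]
    batch["input_values"] = processor(
        audio["array"], sampling_rate=audio["sample_rate"]
    ).input_values[0]
    batch["input_length"] = len(batch["input_values"])
    with processor.as_target_processor():
        batch["labels"] = processor(batch["text"]).input_ids
    return batch

# num_proc=1: parallel mapping buys nothing on a dataset this small.
ds_prepared = ds.map(
    prepare_dataset, remove_columns=ds.column_names["train"], num_proc=1
)
```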
@@ -253,7 +282,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 54,
+ "execution_count": 58,
  "id": "71351cf4-6d00-40ae-89cc-cedb87073625",
  "metadata": {},
  "outputs": [
@@ -363,7 +392,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 55,
+ "execution_count": 59,
  "id": "208eac7d-9fdd-4c82-b46f-25c1a1f246ee",
  "metadata": {},
  "outputs": [
@@ -385,7 +414,7 @@
  " group_by_length=True,\n",
  " per_device_train_batch_size=8,\n",
  " evaluation_strategy=\"steps\",\n",
- " num_train_epochs=30,\n",
+ " num_train_epochs=2,\n",
  " fp16=False,\n",
  " gradient_checkpointing=True,\n",
  " save_steps=500,\n",
410
  },
411
  {
412
  "cell_type": "code",
413
- "execution_count": 56,
414
  "id": "d58f6b8c-441c-4fa9-a308-e687948875e1",
415
  "metadata": {},
416
  "outputs": [
@@ -421,11 +450,11 @@
  "The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length.\n",
  "***** Running training *****\n",
  " Num examples = 1\n",
- " Num Epochs = 30\n",
+ " Num Epochs = 2\n",
  " Instantaneous batch size per device = 8\n",
  " Total train batch size (w. parallel, distributed & accumulation) = 8\n",
  " Gradient Accumulation steps = 1\n",
- " Total optimization steps = 30\n"
+ " Total optimization steps = 2\n"
  ]
  },
  {
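The new numbers in this log follow directly from the configuration: one training example at a per-device batch size of 8 means one optimization step per epoch, so two epochs give two steps:

```python
import math

# Why "Total optimization steps = 2" above.
num_examples, batch_size, num_epochs = 1, 8, 2
steps_per_epoch = math.ceil(num_examples / batch_size)  # = 1
total_steps = steps_per_epoch * num_epochs              # = 2
```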
@@ -434,8 +463,8 @@
  "\n",
  " <div>\n",
  " \n",
- " <progress value='30' max='30' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
- " [30/30 00:28, Epoch 30/30]\n",
+ " <progress value='2' max='2' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+ " [2/2 00:01, Epoch 2/2]\n",
  " </div>\n",
  " <table border=\"1\" class=\"dataframe\">\n",
  " <thead>\n",
@@ -470,10 +499,10 @@
  {
  "data": {
  "text/plain": [
- "TrainOutput(global_step=30, training_loss=16.291970825195314, metrics={'train_runtime': 29.1768, 'train_samples_per_second': 1.028, 'train_steps_per_second': 1.028, 'total_flos': 943749864316800.0, 'train_loss': 16.291970825195314, 'epoch': 30.0})"
+ "TrainOutput(global_step=2, training_loss=16.662765502929688, metrics={'train_runtime': 1.915, 'train_samples_per_second': 1.044, 'train_steps_per_second': 1.044, 'total_flos': 62916657621120.0, 'train_loss': 16.662765502929688, 'epoch': 2.0})"
  ]
  },
- "execution_count": 56,
+ "execution_count": 60,
  "metadata": {},
  "output_type": "execute_result"
  }
@@ -484,7 +513,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 57,
+ "execution_count": 61,
  "id": "333d43cf-add3-4d78-bbca-b44c638519fe",
  "metadata": {},
  "outputs": [
@@ -505,7 +534,7 @@
  "traceback": [
  "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
  "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
- "Input \u001b[0;32mIn [57]\u001b[0m, in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpush_to_hub\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhub_model_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43msharpcoder/wav2vec2_bjorn\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
+ "Input \u001b[0;32mIn [61]\u001b[0m, in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpush_to_hub\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhub_model_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43msharpcoder/wav2vec2_bjorn\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
  "File \u001b[0;32m~/.local/lib/python3.10/site-packages/transformers/trainer.py:2677\u001b[0m, in \u001b[0;36mTrainer.push_to_hub\u001b[0;34m(self, commit_message, blocking, **kwargs)\u001b[0m\n\u001b[1;32m 2674\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mis_world_process_zero():\n\u001b[1;32m 2675\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[0;32m-> 2677\u001b[0m git_head_commit_url \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrepo\u001b[49m\u001b[38;5;241m.\u001b[39mpush_to_hub(commit_message\u001b[38;5;241m=\u001b[39mcommit_message, blocking\u001b[38;5;241m=\u001b[39mblocking)\n\u001b[1;32m 2678\u001b[0m \u001b[38;5;66;03m# push separately the model card to be independant from the rest of the model\u001b[39;00m\n\u001b[1;32m 2679\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mshould_save:\n",
  "\u001b[0;31mAttributeError\u001b[0m: 'Trainer' object has no attribute 'repo'"
  ]
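The closing traceback is unchanged apart from the cell number: `trainer.push_to_hub` fails because `Trainer` only creates its `repo` working clone when it is constructed with `push_to_hub=True` in its `TrainingArguments`. Two hedged ways around it, assuming a 4.x `transformers` and the `model`/`processor` objects built earlier in the notebook:

```python
from transformers import TrainingArguments

# Option 1: let the Trainer manage the Hub repo from the start, so that
# trainer.push_to_hub() finds self.repo initialized.
training_args = TrainingArguments(
    output_dir="wav2vec2_bjorn",              # placeholder
    push_to_hub=True,
    hub_model_id="sharpcoder/wav2vec2_bjorn",
    # ... remaining arguments as in the notebook ...
)

# Option 2: skip the Trainer plumbing and push the artifacts directly.
model.push_to_hub("sharpcoder/wav2vec2_bjorn")
processor.push_to_hub("sharpcoder/wav2vec2_bjorn")
```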
 
config.json CHANGED
@@ -71,7 +71,7 @@
  "num_feat_extract_layers": 7,
  "num_hidden_layers": 12,
  "num_negatives": 100,
- "pad_token_id": 19,
+ "pad_token_id": 26,
  "proj_codevector_dim": 256,
  "torch_dtype": "float32",
  "transformers_version": "4.11.3",
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:6e55a1042cf8cc902bd68a3e8798f90a9cbb95b313e4a3c79082b4dc9d0fc05f
+ oid sha256:124b58d6d3dbb0ca4f29d31fad8c5ad9a70bc43d141b954dd380d21dfebedc17
  size 377667031
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:eb6aaa145951105af08a3ef5d6fd296d211fe596176abfe9aee5116147e093b5
+ oid sha256:d088aa47e4ea7b9e0e9873e040b77e2eb035cf9848d076792a823f0550eed203
  size 2735
vocab.json CHANGED
@@ -1 +1 @@
- {"w": 0, "a": 1, "o": 3, "e": 4, "j": 5, "n": 6, "p": 7, "l": 8, ".": 9, "i": 10, "b": 11, "d": 12, "h": 13, "r": 14, "y": 15, "m": 16, "s": 17, "|": 2, "[UNK]": 18, "[PAD]": 19}
+ {"i": 1, "e": 2, "p": 3, "h": 4, "c": 5, "r": 6, "x": 7, "m": 8, "v": 9, "w": 10, "|": 0, "j": 12, ".": 13, "d": 14, "y": 15, "a": 16, "f": 17, "s": 18, "l": 19, "u": 20, "o": 21, "n": 22, "b": 23, "t": 24, "g": 25, "[UNK]": 25, "[PAD]": 26}