Josh Cole committed
Commit 20c1366 · 1 Parent(s): 6e2f9e3

initial commit
.gitignore ADDED
@@ -0,0 +1 @@
+ .ipynb_checkpoints/
Generate.ipynb ADDED
@@ -0,0 +1,562 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "5205c0d3-2272-4a43-9345-9553af479fe6",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "50bf0f78f5f044dd8be6b181b2cb0949",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from huggingface_hub import notebook_login\n",
+ "notebook_login()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "38bdf299-f60d-43ea-9230-df1be861e406",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Using custom data configuration sharpcoder--bjorn_training-8c32a3534606a113\n",
+ "Reusing dataset parquet (/home/sharpcoder/.cache/huggingface/datasets/sharpcoder___parquet/sharpcoder--bjorn_training-8c32a3534606a113/0.0.0/7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8)\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "c495fe2f4a44499fb32751d60ac1488e",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ " 0%| | 0/1 [00:00<?, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from datasets import load_dataset, load_metric\n",
+ "ds = load_dataset(\"sharpcoder/bjorn_training\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "id": "75b32151-eb53-4476-8c1f-7e6da72e173e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "0f019d1f864b4b56af5c828588fd89bf",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ " 0%| | 0/1 [00:00<?, ?ba/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "def extract_all_chars(batch):\n",
+ " all_text = \" \".join(batch[\"text\"])\n",
+ " vocab = list(set(all_text))\n",
+ " return {\"vocab\": [vocab], \"all_text\": [all_text]}\n",
+ "\n",
+ "vocabs = ds.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=ds.column_names[\"train\"])\n",
+ "vocab_list = list(set(vocabs[\"train\"][\"vocab\"][0]) | set(vocabs[\"train\"][\"vocab\"][0]))\n",
+ "vocab_dict = {v: k for k, v in enumerate(vocab_list)}\n",
+ "vocab_dict[\"|\"] = vocab_dict[\" \"]\n",
+ "del vocab_dict[\" \"]\n",
+ "vocab_dict[\"[UNK]\"] = len(vocab_dict)\n",
+ "vocab_dict[\"[PAD]\"] = len(vocab_dict)\n",
+ "len(vocab_dict)\n",
+ "import json\n",
+ "with open('vocab.json', 'w') as vocab_file:\n",
+ " json.dump(vocab_dict, vocab_file)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "id": "d214872e-d4b1-4aa7-be07-8a1591961968",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from transformers import Wav2Vec2CTCTokenizer\n",
+ "from transformers import Wav2Vec2FeatureExtractor\n",
+ "from transformers import Wav2Vec2Processor\n",
+ "\n",
+ "tokenizer = Wav2Vec2CTCTokenizer(\"./vocab.json\", unk_token=\"[UNK]\", pad_token=\"[PAD]\", word_delimiter_token=\"|\")\n",
+ "feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=False)\n",
+ "processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "id": "e906c45f-6971-43c3-ad0a-b13363100bdf",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def prepare_dataset(batch):\n",
+ " audio = batch[\"audio\"]\n",
+ "\n",
+ " # batched output is \"un-batched\" to ensure mapping is correct\n",
+ " batch[\"input_values\"] = processor(audio[\"array\"], sampling_rate=audio[\"sample_rate\"]).input_values[0]\n",
+ " batch[\"input_length\"] = len(batch[\"input_values\"])\n",
+ " \n",
+ " with processor.as_target_processor():\n",
+ " batch[\"labels\"] = processor(batch[\"text\"]).input_ids\n",
+ " return batch"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "id": "8c083db6-eab5-4f25-9a08-eab50d2d30ac",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "3b36aee8ffc44253a8381da4d0f4c362",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ " 0%| | 0/1 [00:00<?, ?ex/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "ds_prepared = ds.map(prepare_dataset, remove_columns=ds.column_names[\"train\"], num_proc=4)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "id": "50c9a6ad-9e79-4a1c-a5ce-6e1f73a96e4d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
178
+ "\n",
179
+ "from dataclasses import dataclass, field\n",
180
+ "from typing import Any, Dict, List, Optional, Union\n",
181
+ "\n",
182
+ "@dataclass\n",
183
+ "class DataCollatorCTCWithPadding:\n",
184
+ " \"\"\"\n",
185
+ " Data collator that will dynamically pad the inputs received.\n",
186
+ " Args:\n",
187
+ " processor (:class:`~transformers.Wav2Vec2Processor`)\n",
188
+ " The processor used for proccessing the data.\n",
189
+ " padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):\n",
190
+ " Select a strategy to pad the returned sequences (according to the model's padding side and padding index)\n",
191
+ " among:\n",
192
+ " * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single\n",
193
+ " sequence if provided).\n",
194
+ " * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the\n",
195
+ " maximum acceptable input length for the model if that argument is not provided.\n",
196
+ " * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of\n",
197
+ " different lengths).\n",
198
+ " \"\"\"\n",
199
+ "\n",
200
+ " processor: Wav2Vec2Processor\n",
201
+ " padding: Union[bool, str] = True\n",
202
+ "\n",
203
+ " def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:\n",
204
+ " # split inputs and labels since they have to be of different lenghts and need\n",
205
+ " # different padding methods\n",
206
+ " input_features = [{\"input_values\": feature[\"input_values\"]} for feature in features]\n",
207
+ " label_features = [{\"input_ids\": feature[\"labels\"]} for feature in features]\n",
208
+ "\n",
209
+ " batch = self.processor.pad(\n",
210
+ " input_features,\n",
211
+ " padding=self.padding,\n",
212
+ " return_tensors=\"pt\",\n",
213
+ " )\n",
214
+ " with self.processor.as_target_processor():\n",
215
+ " labels_batch = self.processor.pad(\n",
216
+ " label_features,\n",
217
+ " padding=self.padding,\n",
218
+ " return_tensors=\"pt\",\n",
219
+ " )\n",
220
+ "\n",
221
+ " # replace padding with -100 to ignore loss correctly\n",
222
+ " labels = labels_batch[\"input_ids\"].masked_fill(labels_batch.attention_mask.ne(1), -100)\n",
223
+ "\n",
224
+ " batch[\"labels\"] = labels\n",
225
+ "\n",
226
+ " return batch\n",
227
+ " \n",
228
+ "def compute_metrics(pred):\n",
229
+ " pred_logits = pred.predictions\n",
230
+ " pred_ids = np.argmax(pred_logits, axis=-1)\n",
231
+ "\n",
232
+ " pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id\n",
233
+ "\n",
234
+ " pred_str = processor.batch_decode(pred_ids)\n",
235
+ " # we do not want to group tokens when computing the metrics\n",
236
+ " label_str = processor.batch_decode(pred.label_ids, group_tokens=False)\n",
237
+ "\n",
238
+ " wer = wer_metric.compute(predictions=pred_str, references=label_str)\n",
239
+ "\n",
240
+ " return {\"wer\": wer}"
241
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "id": "1025ffdf-cb83-4895-89ab-a98bc3fab642",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)\n",
+ "wer_metric = load_metric(\"wer\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "id": "71351cf4-6d00-40ae-89cc-cedb87073625",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "loading configuration file https://huggingface.co/facebook/wav2vec2-base/resolve/main/config.json from cache at /home/sharpcoder/.cache/huggingface/transformers/c7746642f045322fd01afa31271dd490e677ea11999e68660a92619ec7c892b4.ce1f96bfaf3d7475cb8187b9668c7f19437ade45fb9ceb78d2b06a2cec198015\n",
+ "/home/sharpcoder/.local/lib/python3.10/site-packages/transformers/configuration_utils.py:336: UserWarning: Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 Transformers. Using `model.gradient_checkpointing_enable()` instead, or if you are using the `Trainer` API, pass `gradient_checkpointing=True` in your `TrainingArguments`.\n",
+ " warnings.warn(\n",
+ "Model config Wav2Vec2Config {\n",
+ " \"activation_dropout\": 0.0,\n",
+ " \"apply_spec_augment\": true,\n",
+ " \"architectures\": [\n",
+ " \"Wav2Vec2ForPreTraining\"\n",
+ " ],\n",
+ " \"attention_dropout\": 0.1,\n",
+ " \"bos_token_id\": 1,\n",
+ " \"classifier_proj_size\": 256,\n",
+ " \"codevector_dim\": 256,\n",
+ " \"contrastive_logits_temperature\": 0.1,\n",
+ " \"conv_bias\": false,\n",
+ " \"conv_dim\": [\n",
+ " 512,\n",
+ " 512,\n",
+ " 512,\n",
+ " 512,\n",
+ " 512,\n",
+ " 512,\n",
+ " 512\n",
+ " ],\n",
+ " \"conv_kernel\": [\n",
+ " 10,\n",
+ " 3,\n",
+ " 3,\n",
+ " 3,\n",
+ " 3,\n",
+ " 2,\n",
+ " 2\n",
+ " ],\n",
+ " \"conv_stride\": [\n",
+ " 5,\n",
+ " 2,\n",
+ " 2,\n",
+ " 2,\n",
+ " 2,\n",
+ " 2,\n",
+ " 2\n",
+ " ],\n",
+ " \"ctc_loss_reduction\": \"mean\",\n",
+ " \"ctc_zero_infinity\": false,\n",
+ " \"diversity_loss_weight\": 0.1,\n",
+ " \"do_stable_layer_norm\": false,\n",
+ " \"eos_token_id\": 2,\n",
+ " \"feat_extract_activation\": \"gelu\",\n",
+ " \"feat_extract_norm\": \"group\",\n",
+ " \"feat_proj_dropout\": 0.1,\n",
+ " \"feat_quantizer_dropout\": 0.0,\n",
+ " \"final_dropout\": 0.0,\n",
+ " \"freeze_feat_extract_train\": true,\n",
+ " \"gradient_checkpointing\": true,\n",
+ " \"hidden_act\": \"gelu\",\n",
+ " \"hidden_dropout\": 0.1,\n",
+ " \"hidden_size\": 768,\n",
+ " \"initializer_range\": 0.02,\n",
+ " \"intermediate_size\": 3072,\n",
+ " \"layer_norm_eps\": 1e-05,\n",
+ " \"layerdrop\": 0.0,\n",
+ " \"mask_channel_length\": 10,\n",
+ " \"mask_channel_min_space\": 1,\n",
+ " \"mask_channel_other\": 0.0,\n",
+ " \"mask_channel_prob\": 0.0,\n",
+ " \"mask_channel_selection\": \"static\",\n",
+ " \"mask_feature_length\": 10,\n",
+ " \"mask_feature_prob\": 0.0,\n",
+ " \"mask_time_length\": 10,\n",
+ " \"mask_time_min_space\": 1,\n",
+ " \"mask_time_other\": 0.0,\n",
+ " \"mask_time_prob\": 0.05,\n",
+ " \"mask_time_selection\": \"static\",\n",
+ " \"model_type\": \"wav2vec2\",\n",
+ " \"no_mask_channel_overlap\": false,\n",
+ " \"no_mask_time_overlap\": false,\n",
+ " \"num_attention_heads\": 12,\n",
+ " \"num_codevector_groups\": 2,\n",
+ " \"num_codevectors_per_group\": 320,\n",
+ " \"num_conv_pos_embedding_groups\": 16,\n",
+ " \"num_conv_pos_embeddings\": 128,\n",
+ " \"num_feat_extract_layers\": 7,\n",
+ " \"num_hidden_layers\": 12,\n",
+ " \"num_negatives\": 100,\n",
+ " \"pad_token_id\": 19,\n",
+ " \"proj_codevector_dim\": 256,\n",
+ " \"transformers_version\": \"4.11.3\",\n",
+ " \"use_weighted_layer_sum\": false,\n",
+ " \"vocab_size\": 32\n",
+ "}\n",
+ "\n",
+ "loading weights file https://huggingface.co/facebook/wav2vec2-base/resolve/main/pytorch_model.bin from cache at /home/sharpcoder/.cache/huggingface/transformers/ef45231897ce572a660ebc5a63d3702f1a6041c4c5fb78cbec330708531939b3.fcae05302a685f7904c551c8ea571e8bc2a2c4a1777ea81ad66e47f7883a650a\n",
+ "Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2ForCTC: ['project_q.bias', 'project_hid.bias', 'quantizer.codevectors', 'project_q.weight', 'quantizer.weight_proj.weight', 'quantizer.weight_proj.bias', 'project_hid.weight']\n",
+ "- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+ "- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+ "Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.bias', 'lm_head.weight']\n",
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
+ ]
+ }
+ ],
+ "source": [
+ "from transformers import Wav2Vec2ForCTC\n",
+ "\n",
+ "model = Wav2Vec2ForCTC.from_pretrained(\n",
+ " \"facebook/wav2vec2-base\",\n",
+ " ctc_loss_reduction=\"mean\", \n",
+ " pad_token_id=processor.tokenizer.pad_token_id,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "id": "208eac7d-9fdd-4c82-b46f-25c1a1f246ee",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "PyTorch: setting up devices\n",
+ "The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).\n"
+ ]
+ }
+ ],
+ "source": [
+ "from transformers import TrainingArguments\n",
+ "from transformers import Trainer\n",
+ "\n",
+ "training_args = TrainingArguments(\n",
+ " output_dir=\"sharpcoder/wav2vec2_bjorn\",\n",
+ " group_by_length=True,\n",
+ " per_device_train_batch_size=8,\n",
+ " evaluation_strategy=\"steps\",\n",
+ " num_train_epochs=30,\n",
+ " fp16=False,\n",
+ " gradient_checkpointing=True,\n",
+ " save_steps=500,\n",
+ " eval_steps=500,\n",
+ " logging_steps=500,\n",
+ " learning_rate=1e-4,\n",
+ " weight_decay=0.005,\n",
+ " warmup_steps=1000,\n",
+ " save_total_limit=2,\n",
+ ")\n",
+ "\n",
+ "trainer = Trainer(\n",
+ " model=model,\n",
+ " data_collator=data_collator,\n",
+ " args=training_args,\n",
+ " compute_metrics=compute_metrics,\n",
+ " train_dataset=ds_prepared[\"train\"],\n",
+ " eval_dataset=ds_prepared[\"train\"],\n",
+ " tokenizer=processor.feature_extractor,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "id": "d58f6b8c-441c-4fa9-a308-e687948875e1",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length.\n",
+ "***** Running training *****\n",
+ " Num examples = 1\n",
+ " Num Epochs = 30\n",
+ " Instantaneous batch size per device = 8\n",
+ " Total train batch size (w. parallel, distributed & accumulation) = 8\n",
+ " Gradient Accumulation steps = 1\n",
+ " Total optimization steps = 30\n",
+ "/home/sharpcoder/.local/lib/python3.10/site-packages/transformers/feature_extraction_utils.py:158: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at ../torch/csrc/utils/tensor_new.cpp:210.)\n",
+ " tensor = as_tensor(value)\n",
+ "/home/sharpcoder/.local/lib/python3.10/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py:882: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').\n",
+ " return (input_length - kernel_size) // stride + 1\n",
+ "/home/sharpcoder/.local/lib/python3.10/site-packages/torch/autocast_mode.py:162: UserWarning: User provided device_type of 'cuda', but CUDA is not available. Disabling\n",
+ " warnings.warn('User provided device_type of \\'cuda\\', but CUDA is not available. Disabling')\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " <div>\n",
+ " \n",
+ " <progress value='30' max='30' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+ " [30/30 00:29, Epoch 30/30]\n",
+ " </div>\n",
+ " <table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: left;\">\n",
+ " <th>Step</th>\n",
+ " <th>Training Loss</th>\n",
+ " <th>Validation Loss</th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " </tbody>\n",
+ "</table><p>"
+ ],
+ "text/plain": [
+ "<IPython.core.display.HTML object>"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "\n",
+ "Training completed. Do not forget to share your model on huggingface.co/models =)\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "TrainOutput(global_step=30, training_loss=7.301008097330729, metrics={'train_runtime': 30.6312, 'train_samples_per_second': 0.979, 'train_steps_per_second': 0.979, 'total_flos': 943749864316800.0, 'train_loss': 7.301008097330729, 'epoch': 30.0})"
+ ]
+ },
+ "execution_count": 46,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "trainer.train()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "id": "70866f1f-3745-4e68-acd5-f50b6eff348b",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Saving model checkpoint to sharpcoder/wav2vec2_bjorn\n",
+ "Configuration saved in sharpcoder/wav2vec2_bjorn/config.json\n",
+ "Model weights saved in sharpcoder/wav2vec2_bjorn/pytorch_model.bin\n",
+ "Configuration saved in sharpcoder/wav2vec2_bjorn/preprocessor_config.json\n"
+ ]
+ },
+ {
+ "ename": "AttributeError",
+ "evalue": "'Trainer' object has no attribute 'repo'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
+ "Input \u001b[0;32mIn [47]\u001b[0m, in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpush_to_hub\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
+ "File \u001b[0;32m~/.local/lib/python3.10/site-packages/transformers/trainer.py:2677\u001b[0m, in \u001b[0;36mTrainer.push_to_hub\u001b[0;34m(self, commit_message, blocking, **kwargs)\u001b[0m\n\u001b[1;32m 2674\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mis_world_process_zero():\n\u001b[1;32m 2675\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[0;32m-> 2677\u001b[0m git_head_commit_url \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrepo\u001b[49m\u001b[38;5;241m.\u001b[39mpush_to_hub(commit_message\u001b[38;5;241m=\u001b[39mcommit_message, blocking\u001b[38;5;241m=\u001b[39mblocking)\n\u001b[1;32m 2678\u001b[0m \u001b[38;5;66;03m# push separately the model card to be independant from the rest of the model\u001b[39;00m\n\u001b[1;32m 2679\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mshould_save:\n",
+ "\u001b[0;31mAttributeError\u001b[0m: 'Trainer' object has no attribute 'repo'"
+ ]
+ }
+ ],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "333d43cf-add3-4d78-bbca-b44c638519fe",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
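Editor's note on the failing final cell: `trainer.push_to_hub()` raises `AttributeError: 'Trainer' object has no attribute 'repo'` because, in transformers 4.11, the Trainer only clones the Hub repository (and sets `self.repo`) when the training arguments are created with `push_to_hub=True`. The sketch below is a likely fix, not part of this commit; it reuses the `model`, `data_collator`, `compute_metrics`, `ds_prepared`, and `processor` objects defined in the notebook above and otherwise mirrors the notebook's argument values.

# Hedged sketch: enable push_to_hub up front so the Trainer can push later.
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="sharpcoder/wav2vec2_bjorn",
    group_by_length=True,
    per_device_train_batch_size=8,
    evaluation_strategy="steps",
    num_train_epochs=30,
    fp16=False,
    gradient_checkpointing=True,
    save_steps=500,
    eval_steps=500,
    logging_steps=500,
    learning_rate=1e-4,
    weight_decay=0.005,
    warmup_steps=1000,
    save_total_limit=2,
    push_to_hub=True,  # sets up trainer.repo, the clone that push_to_hub() pushes
)

trainer = Trainer(
    model=model,                       # objects defined earlier in the notebook
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=ds_prepared["train"],
    eval_dataset=ds_prepared["train"],
    tokenizer=processor.feature_extractor,
)

trainer.train()
trainer.push_to_hub()  # now pushes output_dir to the Hub instead of raising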
sharpcoder/wav2vec2_bjorn/config.json ADDED
@@ -0,0 +1,88 @@
+ {
+ "_name_or_path": "facebook/wav2vec2-base",
+ "activation_dropout": 0.0,
+ "apply_spec_augment": true,
+ "architectures": [
+ "Wav2Vec2ForCTC"
+ ],
+ "attention_dropout": 0.1,
+ "bos_token_id": 1,
+ "classifier_proj_size": 256,
+ "codevector_dim": 256,
+ "contrastive_logits_temperature": 0.1,
+ "conv_bias": false,
+ "conv_dim": [
+ 512,
+ 512,
+ 512,
+ 512,
+ 512,
+ 512,
+ 512
+ ],
+ "conv_kernel": [
+ 10,
+ 3,
+ 3,
+ 3,
+ 3,
+ 2,
+ 2
+ ],
+ "conv_stride": [
+ 5,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2
+ ],
+ "ctc_loss_reduction": "mean",
+ "ctc_zero_infinity": false,
+ "diversity_loss_weight": 0.1,
+ "do_stable_layer_norm": false,
+ "eos_token_id": 2,
+ "feat_extract_activation": "gelu",
+ "feat_extract_norm": "group",
+ "feat_proj_dropout": 0.1,
+ "feat_quantizer_dropout": 0.0,
+ "final_dropout": 0.0,
+ "freeze_feat_extract_train": true,
+ "hidden_act": "gelu",
+ "hidden_dropout": 0.1,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_norm_eps": 1e-05,
+ "layerdrop": 0.0,
+ "mask_channel_length": 10,
+ "mask_channel_min_space": 1,
+ "mask_channel_other": 0.0,
+ "mask_channel_prob": 0.0,
+ "mask_channel_selection": "static",
+ "mask_feature_length": 10,
+ "mask_feature_prob": 0.0,
+ "mask_time_length": 10,
+ "mask_time_min_space": 1,
+ "mask_time_other": 0.0,
+ "mask_time_prob": 0.05,
+ "mask_time_selection": "static",
+ "model_type": "wav2vec2",
+ "no_mask_channel_overlap": false,
+ "no_mask_time_overlap": false,
+ "num_attention_heads": 12,
+ "num_codevector_groups": 2,
+ "num_codevectors_per_group": 320,
+ "num_conv_pos_embedding_groups": 16,
+ "num_conv_pos_embeddings": 128,
+ "num_feat_extract_layers": 7,
+ "num_hidden_layers": 12,
+ "num_negatives": 100,
+ "pad_token_id": 19,
+ "proj_codevector_dim": 256,
+ "torch_dtype": "float32",
+ "transformers_version": "4.11.3",
+ "use_weighted_layer_sum": false,
+ "vocab_size": 32
+ }
sharpcoder/wav2vec2_bjorn/preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+ "do_normalize": true,
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+ "feature_size": 1,
+ "padding_side": "right",
+ "padding_value": 0.0,
+ "return_attention_mask": false,
+ "sampling_rate": 16000
+ }
sharpcoder/wav2vec2_bjorn/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:873bf552da3de5ce2fc1efbe234017f06cf7b9b70812d408585136c69486cb81
+ size 377667031
sharpcoder/wav2vec2_bjorn/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2b61aecf82c993254e7b0fbeb1c240469688a2bf27cd91d288ef05824cd7c911
+ size 2799
vocab.json ADDED
@@ -0,0 +1 @@
+ {"w": 0, "y": 1, "m": 2, "i": 3, "e": 4, "s": 5, "r": 6, "p": 7, "n": 8, "a": 9, "h": 10, ".": 11, "j": 12, "b": 13, "d": 14, "l": 15, "o": 16, "|": 17, "[UNK]": 18, "[PAD]": 19}