Josh Cole committed
Commit d8ed61a · 1 Parent(s): fd184d9
Files changed (5)
  1. Generate.ipynb +64 -35
  2. config.json +1 -1
  3. pytorch_model.bin +1 -1
  4. training_args.bin +1 -1
  5. vocab.json +1 -1
Generate.ipynb CHANGED
@@ -28,7 +28,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 2,
+ "execution_count": 87,
  "id": "38bdf299-f60d-43ea-9230-df1be861e406",
  "metadata": {},
  "outputs": [
@@ -36,14 +36,14 @@
  "name": "stderr",
  "output_type": "stream",
  "text": [
- "Using custom data configuration sharpcoder--bjorn_training-8c32a3534606a113\n",
- "Reusing dataset parquet (/home/sharpcoder/.cache/huggingface/datasets/sharpcoder___parquet/sharpcoder--bjorn_training-8c32a3534606a113/0.0.0/7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8)\n"
+ "Using custom data configuration sharpcoder--bjorn_training-49dfdd879ea26ec8\n",
+ "Reusing dataset parquet (/home/sharpcoder/.cache/huggingface/datasets/sharpcoder___parquet/sharpcoder--bjorn_training-49dfdd879ea26ec8/0.0.0/7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8)\n"
  ]
  },
  {
  "data": {
  "application/vnd.jupyter.widget-view+json": {
- "model_id": "18cae671f8fd4f9baac804c91fee03bf",
+ "model_id": "7ace7bfb1aee4fe3946da3cf4616edb6",
  "version_major": 2,
  "version_minor": 0
  },
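Note: the stderr output in this hunk is `datasets` resolving the repo's parquet files; the configuration hash changed because the underlying training data changed in this commit. A minimal sketch of the load that presumably produced it (the actual cell sits above this hunk and is outside the diff; the dataset name is inferred from the cache path):

```python
from datasets import load_dataset

# Inferred from "sharpcoder--bjorn_training" in the cache path above;
# the exact call is not shown in this diff.
ds = load_dataset("sharpcoder/bjorn_training")
```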
@@ -62,14 +62,14 @@
  },
  {
  "cell_type": "code",
- "execution_count": 22,
+ "execution_count": 89,
  "id": "75b32151-eb53-4476-8c1f-7e6da72e173e",
  "metadata": {},
  "outputs": [
  {
  "data": {
  "application/vnd.jupyter.widget-view+json": {
- "model_id": "0611b2fa6cf740d6925d03cf3ba525a2",
+ "model_id": "5d7fae2ce3cb4c9c9be58bf46d991f3d",
  "version_major": 2,
  "version_minor": 0
  },
@@ -102,7 +102,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 23,
+ "execution_count": 90,
  "id": "d214872e-d4b1-4aa7-be07-8a1591961968",
  "metadata": {},
  "outputs": [],
@@ -111,60 +111,89 @@
  "from transformers import Wav2Vec2FeatureExtractor\n",
  "from transformers import Wav2Vec2Processor\n",
  "\n",
- "tokenizer = Wav2Vec2CTCTokenizer(\"./vocab.json\", unk_token=\"[UNK]\", pad_token=\"[PAD]\", word_delimiter_token=\" \")\n",
+ "tokenizer = Wav2Vec2CTCTokenizer(\"./vocab.json\", unk_token=\"[UNK]\", pad_token=\"[PAD]\", word_delimiter_token=\"|\")\n",
  "feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=False)\n",
  "processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)"
  ]
  },
  {
  "cell_type": "code",
- "execution_count": 24,
+ "execution_count": 107,
  "id": "e906c45f-6971-43c3-ad0a-b13363100bdf",
  "metadata": {},
  "outputs": [],
  "source": [
  "def prepare_dataset(batch):\n",
- " audio = batch[\"audio\"]\n",
- "\n",
+ " audio = batch[\"audio\"][0]\n",
  " # batched output is \"un-batched\" to ensure mapping is correct\n",
  " batch[\"input_values\"] = processor(audio[\"array\"], sampling_rate=audio[\"sample_rate\"]).input_values[0]\n",
  " batch[\"input_length\"] = len(batch[\"input_values\"])\n",
  " \n",
  " with processor.as_target_processor():\n",
  " batch[\"labels\"] = processor(batch[\"text\"]).input_ids\n",
- " return batch"
+ " return batch\n"
  ]
  },
  {
  "cell_type": "code",
- "execution_count": 25,
- "id": "8c083db6-eab5-4f25-9a08-eab50d2d30ac",
+ "execution_count": 108,
+ "id": "859e5dce-cb41-4647-98cb-de084a4b9d7e",
  "metadata": {},
  "outputs": [
  {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.\n"
- ]
- },
+ "data": {
+ "text/plain": [
+ "['audio', 'text']"
+ ]
+ },
+ "execution_count": 108,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ds.column_names['train']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 109,
+ "id": "8c083db6-eab5-4f25-9a08-eab50d2d30ac",
+ "metadata": {},
+ "outputs": [
  {
  "data": {
  "application/vnd.jupyter.widget-view+json": {
- "model_id": "ae21f7b6a50241e4ab4dd2b5c7c5689c",
+ "model_id": "7575024d6ef048e39d7b25654f993957",
  "version_major": 2,
  "version_minor": 0
  },
  "text/plain": [
- " 0%| | 0/1 [00:00<?, ?ex/s]"
+ " 0%| | 0/4 [00:00<?, ?ex/s]"
  ]
  },
  "metadata": {},
  "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "IOPub data rate exceeded.\n",
+ "The Jupyter server will temporarily stop sending output\n",
+ "to the client in order to avoid crashing it.\n",
+ "To change this limit, set the config variable\n",
+ "`--ServerApp.iopub_data_rate_limit`.\n",
+ "\n",
+ "Current values:\n",
+ "ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)\n",
+ "ServerApp.rate_limit_window=3.0 (secs)\n",
+ "\n"
+ ]
  }
  ],
  "source": [
- "ds_prepared = ds.map(prepare_dataset, remove_columns=ds.column_names[\"train\"], num_proc=4)"
+ "ds_prepared = ds.map(prepare_dataset, remove_columns=ds.column_names[\"train\"], num_proc=1)"
  ]
  },
  {
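This hunk carries the substantive fixes of the commit: the CTC word delimiter becomes `"|"` rather than a literal space, `prepare_dataset` indexes `batch["audio"][0]` because the audio column evidently holds a list per example, a new cell inspects `ds.column_names['train']`, and `ds.map` drops `num_proc=4` (the previous run had warned "num_proc must be <= 1" for the size-1 dataset). Unescaped from the notebook JSON, the updated cells read as follows (a sketch; `ds` comes from the loading cell above this hunk):

```python
from transformers import (
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2Processor,
)

# "|" is the conventional word delimiter for Wav2Vec2 CTC vocabularies;
# a literal space does not survive the round trip through vocab.json.
tokenizer = Wav2Vec2CTCTokenizer(
    "./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|"
)
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=False,
)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

def prepare_dataset(batch):
    # The audio column stores a list per example, hence the [0];
    # this dataset names its rate field "sample_rate".
    audio = batch["audio"][0]
    batch["input_values"] = processor(
        audio["array"], sampling_rate=audio["sample_rate"]
    ).input_values[0]
    batch["input_length"] = len(batch["input_values"])
    with processor.as_target_processor():
        batch["labels"] = processor(batch["text"]).input_ids
    return batch

# num_proc=1: parallel mapping buys nothing on a dataset this small.
ds_prepared = ds.map(
    prepare_dataset, remove_columns=ds.column_names["train"], num_proc=1
)
```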
@@ -253,7 +282,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 54,
+ "execution_count": 58,
  "id": "71351cf4-6d00-40ae-89cc-cedb87073625",
  "metadata": {},
  "outputs": [
@@ -363,7 +392,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 55,
+ "execution_count": 59,
  "id": "208eac7d-9fdd-4c82-b46f-25c1a1f246ee",
  "metadata": {},
  "outputs": [
@@ -385,7 +414,7 @@
  " group_by_length=True,\n",
  " per_device_train_batch_size=8,\n",
  " evaluation_strategy=\"steps\",\n",
- " num_train_epochs=30,\n",
+ " num_train_epochs=2,\n",
  " fp16=False,\n",
  " gradient_checkpointing=True,\n",
  " save_steps=500,\n",
410
  },
411
  {
412
  "cell_type": "code",
413
- "execution_count": 56,
414
  "id": "d58f6b8c-441c-4fa9-a308-e687948875e1",
415
  "metadata": {},
416
  "outputs": [
@@ -421,11 +450,11 @@
  "The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length.\n",
  "***** Running training *****\n",
  " Num examples = 1\n",
- " Num Epochs = 30\n",
+ " Num Epochs = 2\n",
  " Instantaneous batch size per device = 8\n",
  " Total train batch size (w. parallel, distributed & accumulation) = 8\n",
  " Gradient Accumulation steps = 1\n",
- " Total optimization steps = 30\n"
+ " Total optimization steps = 2\n"
  ]
  },
  {
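The new numbers in this log follow directly from the configuration: one training example at a per-device batch size of 8 means one optimization step per epoch, so two epochs give two steps:

```python
import math

# Why "Total optimization steps = 2" above.
num_examples, batch_size, num_epochs = 1, 8, 2
steps_per_epoch = math.ceil(num_examples / batch_size)  # = 1
total_steps = steps_per_epoch * num_epochs              # = 2
```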
@@ -434,8 +463,8 @@
  "\n",
  " <div>\n",
  " \n",
- " <progress value='30' max='30' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
- " [30/30 00:28, Epoch 30/30]\n",
+ " <progress value='2' max='2' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+ " [2/2 00:01, Epoch 2/2]\n",
  " </div>\n",
  " <table border=\"1\" class=\"dataframe\">\n",
  " <thead>\n",
@@ -470,10 +499,10 @@
  {
  "data": {
  "text/plain": [
- "TrainOutput(global_step=30, training_loss=16.291970825195314, metrics={'train_runtime': 29.1768, 'train_samples_per_second': 1.028, 'train_steps_per_second': 1.028, 'total_flos': 943749864316800.0, 'train_loss': 16.291970825195314, 'epoch': 30.0})"
+ "TrainOutput(global_step=2, training_loss=16.662765502929688, metrics={'train_runtime': 1.915, 'train_samples_per_second': 1.044, 'train_steps_per_second': 1.044, 'total_flos': 62916657621120.0, 'train_loss': 16.662765502929688, 'epoch': 2.0})"
  ]
  },
- "execution_count": 56,
+ "execution_count": 60,
  "metadata": {},
  "output_type": "execute_result"
  }
@@ -484,7 +513,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 57,
+ "execution_count": 61,
  "id": "333d43cf-add3-4d78-bbca-b44c638519fe",
  "metadata": {},
  "outputs": [
@@ -505,7 +534,7 @@
  "traceback": [
  "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
  "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
- "Input \u001b[0;32mIn [57]\u001b[0m, in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpush_to_hub\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhub_model_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43msharpcoder/wav2vec2_bjorn\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
+ "Input \u001b[0;32mIn [61]\u001b[0m, in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpush_to_hub\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhub_model_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43msharpcoder/wav2vec2_bjorn\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
  "File \u001b[0;32m~/.local/lib/python3.10/site-packages/transformers/trainer.py:2677\u001b[0m, in \u001b[0;36mTrainer.push_to_hub\u001b[0;34m(self, commit_message, blocking, **kwargs)\u001b[0m\n\u001b[1;32m 2674\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mis_world_process_zero():\n\u001b[1;32m 2675\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[0;32m-> 2677\u001b[0m git_head_commit_url \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrepo\u001b[49m\u001b[38;5;241m.\u001b[39mpush_to_hub(commit_message\u001b[38;5;241m=\u001b[39mcommit_message, blocking\u001b[38;5;241m=\u001b[39mblocking)\n\u001b[1;32m 2678\u001b[0m \u001b[38;5;66;03m# push separately the model card to be independant from the rest of the model\u001b[39;00m\n\u001b[1;32m 2679\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mshould_save:\n",
  "\u001b[0;31mAttributeError\u001b[0m: 'Trainer' object has no attribute 'repo'"
  ]
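The closing traceback is unchanged apart from the cell number: `trainer.push_to_hub` fails because `Trainer` only creates its `repo` working clone when it is constructed with `push_to_hub=True` in its `TrainingArguments`. Two hedged ways around it, assuming a 4.x `transformers` and the `model`/`processor` objects built earlier in the notebook:

```python
from transformers import TrainingArguments

# Option 1: let the Trainer manage the Hub repo from the start, so that
# trainer.push_to_hub() finds self.repo initialized.
training_args = TrainingArguments(
    output_dir="wav2vec2_bjorn",              # placeholder
    push_to_hub=True,
    hub_model_id="sharpcoder/wav2vec2_bjorn",
    # ... remaining arguments as in the notebook ...
)

# Option 2: skip the Trainer plumbing and push the artifacts directly.
model.push_to_hub("sharpcoder/wav2vec2_bjorn")
processor.push_to_hub("sharpcoder/wav2vec2_bjorn")
```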
 
config.json CHANGED
@@ -71,7 +71,7 @@
  "num_feat_extract_layers": 7,
  "num_hidden_layers": 12,
  "num_negatives": 100,
- "pad_token_id": 19,
+ "pad_token_id": 26,
  "proj_codevector_dim": 256,
  "torch_dtype": "float32",
  "transformers_version": "4.11.3",
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:6e55a1042cf8cc902bd68a3e8798f90a9cbb95b313e4a3c79082b4dc9d0fc05f
+ oid sha256:124b58d6d3dbb0ca4f29d31fad8c5ad9a70bc43d141b954dd380d21dfebedc17
  size 377667031
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:eb6aaa145951105af08a3ef5d6fd296d211fe596176abfe9aee5116147e093b5
+ oid sha256:d088aa47e4ea7b9e0e9873e040b77e2eb035cf9848d076792a823f0550eed203
  size 2735
vocab.json CHANGED
@@ -1 +1 @@
- {"w": 0, "a": 1, "o": 3, "e": 4, "j": 5, "n": 6, "p": 7, "l": 8, ".": 9, "i": 10, "b": 11, "d": 12, "h": 13, "r": 14, "y": 15, "m": 16, "s": 17, "|": 2, "[UNK]": 18, "[PAD]": 19}
+ {"i": 1, "e": 2, "p": 3, "h": 4, "c": 5, "r": 6, "x": 7, "m": 8, "v": 9, "w": 10, "|": 0, "j": 12, ".": 13, "d": 14, "y": 15, "a": 16, "f": 17, "s": 18, "l": 19, "u": 20, "o": 21, "n": 22, "b": 23, "t": 24, "g": 25, "[UNK]": 25, "[PAD]": 26}