{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "5205c0d3-2272-4a43-9345-9553af479fe6", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "7afc85b57ea24d31a2fdcc2b1f5c9ace", "version_major": 2, "version_minor": 0 }, "text/plain": [ "VBox(children=(HTML(value='
{ "cell_type": "code", "execution_count": null, "id": "data-collator-and-metrics", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import torch\n", "from dataclasses import dataclass\n", "from datasets import load_metric\n", "from typing import Dict, List, Union\n", "from transformers import Wav2Vec2Processor\n", "\n", "@dataclass\n", "class DataCollatorCTCWithPadding:\n", "    # dynamically pads raw audio inputs and label ids to the longest item in the batch\n", "    processor: Wav2Vec2Processor\n", "    padding: Union[bool, str] = True\n", "\n", "    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:\n", "        # split inputs and labels since they have to be of different lengths and need\n", "        # different padding methods\n", "        input_features = [{\"input_values\": feature[\"input_values\"]} for feature in features]\n", "        label_features = [{\"input_ids\": feature[\"labels\"]} for feature in features]\n", "\n", "        batch = self.processor.pad(\n", "            input_features,\n", "            padding=self.padding,\n", "            return_tensors=\"pt\",\n", "        )\n", "        with self.processor.as_target_processor():\n", "            labels_batch = self.processor.pad(\n", "                label_features,\n", "                padding=self.padding,\n", "                return_tensors=\"pt\",\n", "            )\n", "\n", "        # replace padding with -100 so it is ignored by the CTC loss\n", "        labels = labels_batch[\"input_ids\"].masked_fill(labels_batch.attention_mask.ne(1), -100)\n", "\n", "        batch[\"labels\"] = labels\n", "\n", "        return batch\n", "\n", "def compute_metrics(pred):\n", "    pred_logits = pred.predictions\n", "    pred_ids = np.argmax(pred_logits, axis=-1)\n", "\n", "    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id\n", "\n", "    pred_str = processor.batch_decode(pred_ids)\n", "    # we do not want to group tokens when computing the metrics\n", "    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)\n", "\n", "    wer = wer_metric.compute(predictions=pred_str, references=label_str)\n", "\n", "    return {\"wer\": wer}" ] }, { "cell_type": "code", "execution_count": 49, "id": "1025ffdf-cb83-4895-89ab-a98bc3fab642", "metadata": {}, "outputs": [], "source": [ "data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)\n", "wer_metric = load_metric(\"wer\")" ] },
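{ "cell_type": "code", "execution_count": null, "id": "collator-sanity-check", "metadata": {}, "outputs": [], "source": [ "# Added sketch (not part of the original run): pad two made-up features of\n", "# different lengths to check the collator's behavior. Assumes `processor`\n", "# from the earlier preprocessing cells is in scope.\n", "dummy_features = [\n", "    {\"input_values\": [0.1] * 16000, \"labels\": [3, 4, 5]},\n", "    {\"input_values\": [0.2] * 8000, \"labels\": [6, 7]},\n", "]\n", "dummy_batch = data_collator(dummy_features)\n", "print(dummy_batch[\"input_values\"].shape)  # torch.Size([2, 16000])\n", "print(dummy_batch[\"labels\"])              # second row padded with -100" ] },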
{ "cell_type": "code", "execution_count": 58, "id": "71351cf4-6d00-40ae-89cc-cedb87073625", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "loading configuration file https://huggingface.co/facebook/wav2vec2-base-960h/resolve/main/config.json from cache at /home/sharpcoder/.cache/huggingface/transformers/cbb3014bb9f03ead9b94f4a791ff8e777465307670e85079d35e28cbc5d88727.0e2d739358c9b58747bd19db5f9f4320dacabbeb1e6282f5cc1069c5c55a82d2\n", "Model config Wav2Vec2Config {\n", "  \"_name_or_path\": \"facebook/wav2vec2-base-960h\",\n", "  \"activation_dropout\": 0.1,\n", "  \"apply_spec_augment\": true,\n", "  \"architectures\": [ \"Wav2Vec2ForCTC\" ],\n", "  \"attention_dropout\": 0.1,\n", "  \"bos_token_id\": 1,\n", "  \"classifier_proj_size\": 256,\n", "  \"codevector_dim\": 256,\n", "  \"contrastive_logits_temperature\": 0.1,\n", "  \"conv_bias\": false,\n", "  \"conv_dim\": [ 512, 512, 512, 512, 512, 512, 512 ],\n", "  \"conv_kernel\": [ 10, 3, 3, 3, 3, 2, 2 ],\n", "  \"conv_stride\": [ 5, 2, 2, 2, 2, 2, 2 ],\n", "  \"ctc_loss_reduction\": \"mean\",\n", "  \"ctc_zero_infinity\": false,\n", "  \"diversity_loss_weight\": 0.1,\n", "  \"do_stable_layer_norm\": false,\n", "  \"eos_token_id\": 2,\n", "  \"feat_extract_activation\": \"gelu\",\n", "  \"feat_extract_dropout\": 0.0,\n", "  \"feat_extract_norm\": \"group\",\n", "  \"feat_proj_dropout\": 0.1,\n", "  \"feat_quantizer_dropout\": 0.0,\n", "  \"final_dropout\": 0.1,\n", "  \"gradient_checkpointing\": false,\n", "  \"hidden_act\": \"gelu\",\n", "  \"hidden_dropout\": 0.1,\n", "  \"hidden_dropout_prob\": 0.1,\n", "  \"hidden_size\": 768,\n", "  \"initializer_range\": 0.02,\n", "  \"intermediate_size\": 3072,\n", "  \"layer_norm_eps\": 1e-05,\n", "  \"layerdrop\": 0.1,\n", "  \"mask_feature_length\": 10,\n", "  \"mask_feature_prob\": 0.0,\n", "  \"mask_time_length\": 10,\n", "  \"mask_time_prob\": 0.05,\n", "  \"model_type\": \"wav2vec2\",\n", "  \"num_attention_heads\": 12,\n", "  \"num_codevector_groups\": 2,\n", "  \"num_codevectors_per_group\": 320,\n", "  \"num_conv_pos_embedding_groups\": 16,\n", "  \"num_conv_pos_embeddings\": 128,\n", "  \"num_feat_extract_layers\": 7,\n", "  \"num_hidden_layers\": 12,\n", "  \"num_negatives\": 100,\n", "  \"pad_token_id\": 19,\n", "  \"proj_codevector_dim\": 256,\n", "  \"transformers_version\": \"4.11.3\",\n", "  \"use_weighted_layer_sum\": false,\n", "  \"vocab_size\": 32\n", "}\n", "\n", "loading weights file https://huggingface.co/facebook/wav2vec2-base-960h/resolve/main/pytorch_model.bin from cache at /home/sharpcoder/.cache/huggingface/transformers/4cb133d3cf3e58e8a4e088b1fc826611a3bcf3d98b20a0bb49ce8cd5362411b7.beeaccfa4baf44ba6123c23938d8a17f48344361a5e7041782e537dfd78a2037\n", "All model checkpoint weights were used when initializing Wav2Vec2ForCTC.\n", "\n", "Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] } ], "source": [ "from transformers import Wav2Vec2ForCTC\n", "\n", "model = Wav2Vec2ForCTC.from_pretrained(\n", "    # \"facebook/wav2vec2-base\",\n", "    \"facebook/wav2vec2-base-960h\",\n", "    ctc_loss_reduction=\"mean\",\n", "    pad_token_id=processor.tokenizer.pad_token_id,\n", ")" ] }, { "cell_type": "code", "execution_count": 59, "id": "208eac7d-9fdd-4c82-b46f-25c1a1f246ee", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "PyTorch: setting up devices\n", "The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).\n" ] } ], "source": [ "from transformers import TrainingArguments\n", "from transformers import Trainer\n", "\n", "training_args = TrainingArguments(\n", "    output_dir=\"./\",\n", "    group_by_length=True,\n", "    per_device_train_batch_size=8,\n", "    evaluation_strategy=\"steps\",\n", "    num_train_epochs=2,\n", "    fp16=False,\n", "    gradient_checkpointing=True,\n", "    save_steps=500,\n", "    eval_steps=500,\n", "    logging_steps=500,\n", "    learning_rate=1e-4,\n", "    weight_decay=0.005,\n", "    warmup_steps=1000,\n", "    save_total_limit=2,\n", ")\n", "\n", "trainer = Trainer(\n", "    model=model,\n", "    data_collator=data_collator,\n", "    args=training_args,\n", "    compute_metrics=compute_metrics,\n", "    train_dataset=ds_prepared[\"train\"],\n", "    # no separate validation split here, so evaluation reuses the training split\n", "    eval_dataset=ds_prepared[\"train\"],\n", "    tokenizer=processor.feature_extractor,\n", ")" ] },
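{ "cell_type": "code", "execution_count": null, "id": "forward-pass-check", "metadata": {}, "outputs": [], "source": [ "# Added sketch (not part of the original run): push one padded batch through the\n", "# model before training, to confirm shapes and that the CTC loss is finite.\n", "# Assumes `ds_prepared`, `data_collator`, and `model` from the cells above.\n", "import torch\n", "\n", "check_batch = data_collator([ds_prepared[\"train\"][0]])\n", "with torch.no_grad():\n", "    out = model(input_values=check_batch[\"input_values\"], labels=check_batch[\"labels\"])\n", "print(out.loss)          # scalar CTC loss\n", "print(out.logits.shape)  # (batch, time_steps, vocab_size=32)" ] },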
{ "cell_type": "code", "execution_count": 60, "id": "d58f6b8c-441c-4fa9-a308-e687948875e1", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length.\n", "***** Running training *****\n", "  Num examples = 1\n", "  Num Epochs = 2\n", "  Instantaneous batch size per device = 8\n", "  Total train batch size (w. parallel, distributed & accumulation) = 8\n", "  Gradient Accumulation steps = 1\n", "  Total optimization steps = 2\n" ] }, { "data": { "text/html": [ "[2/2 00:01, Epoch 2/2]\n", "Step | Training Loss | Validation Loss\n", "(progress table shows no rows: only 2 optimization steps, below logging_steps=500)\n" ], "text/plain": [ "<IPython.core.display.HTML object>" ] }, "metadata": {}, "output_type": "display_data" },
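{ "cell_type": "code", "execution_count": null, "id": "push-to-hub-note", "metadata": {}, "outputs": [], "source": [ "# Added note (not part of the original run): `trainer.push_to_hub(...)` two cells\n", "# below fails with AttributeError: 'Trainer' object has no attribute 'repo'.\n", "# The Trainer only creates its Hub repo when TrainingArguments is constructed\n", "# with push_to_hub=True, so a push-capable setup would look roughly like this\n", "# sketch (the repo-id argument name varies by transformers version:\n", "# push_to_hub_model_id in 4.11-era releases, hub_model_id in later ones):\n", "#\n", "# training_args = TrainingArguments(\n", "#     output_dir=\"wav2vec2_bjorn\",\n", "#     push_to_hub=True,\n", "#     ...\n", "# )\n", "# trainer = Trainer(model=model, args=training_args, ...)\n", "# trainer.push_to_hub()" ] },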
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "\n", "Training completed. Do not forget to share your model on huggingface.co/models =)\n", "\n", "\n" ] }, { "data": { "text/plain": [ "TrainOutput(global_step=2, training_loss=16.662765502929688, metrics={'train_runtime': 1.915, 'train_samples_per_second': 1.044, 'train_steps_per_second': 1.044, 'total_flos': 62916657621120.0, 'train_loss': 16.662765502929688, 'epoch': 2.0})" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "trainer.train()" ] }, { "cell_type": "code", "execution_count": 61, "id": "333d43cf-add3-4d78-bbca-b44c638519fe", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Saving model checkpoint to ./\n", "Configuration saved in ./config.json\n", "Model weights saved in ./pytorch_model.bin\n", "Configuration saved in ./preprocessor_config.json\n" ] }, { "ename": "AttributeError", "evalue": "'Trainer' object has no attribute 'repo'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", "Input \u001b[0;32mIn [61]\u001b[0m, in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpush_to_hub\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhub_model_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43msharpcoder/wav2vec2_bjorn\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m~/.local/lib/python3.10/site-packages/transformers/trainer.py:2677\u001b[0m, in \u001b[0;36mTrainer.push_to_hub\u001b[0;34m(self, commit_message, blocking, **kwargs)\u001b[0m\n\u001b[1;32m 2674\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mis_world_process_zero():\n\u001b[1;32m 2675\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[0;32m-> 2677\u001b[0m git_head_commit_url \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrepo\u001b[49m\u001b[38;5;241m.\u001b[39mpush_to_hub(commit_message\u001b[38;5;241m=\u001b[39mcommit_message, blocking\u001b[38;5;241m=\u001b[39mblocking)\n\u001b[1;32m 2678\u001b[0m \u001b[38;5;66;03m# push separately the model card to be independant from the rest of the model\u001b[39;00m\n\u001b[1;32m 2679\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mshould_save:\n", "\u001b[0;31mAttributeError\u001b[0m: 'Trainer' object has no attribute 'repo'" ] } ], "source": [ "trainer.push_to_hub(hub_model_id=\"sharpcoder/wav2vec2_bjorn\")" ] }, { "cell_type": "code", "execution_count": null, "id": "a5cb9a88-2443-4bd9-85ac-12bf80a9e325", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.4" } }, "nbformat": 4, "nbformat_minor": 5 }