{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "p5S2GYrJe6lb"
   },
   "source": [
    "# Image to text for Airbnb images\n",
    "\n",
    "Fine-tune the `Salesforce/blip-image-captioning-base` model on Airbnb living-room photos paired with text taken from the `answers` column of a CSV file, evaluate it on a held-out split, and serve it through a small Gradio app."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "lG3i-iiWe7l_"
   },
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import torch\n",
    "from PIL import Image\n",
    "from torch.utils.data import Dataset\n",
    "from torchvision import transforms\n",
    "from transformers import AutoProcessor\n",
    "from transformers import BlipForConditionalGeneration"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Configuration\n",
    "\n",
    "Everything a reader may need to adapt (paths, model checkpoint, seed, split and batch size) lives in the next cell."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Root folder holding the CSV file and the image folder.\n",
    "DATA_DIR = Path(\"/home/cassandra@myliser.lu/image_to_text/blip\")\n",
    "IMAGE_DIR = DATA_DIR / \"living_room\"\n",
    "MODEL_NAME = \"Salesforce/blip-image-captioning-base\"\n",
    "\n",
    "SEED = 42              # makes the train/test split and the shuffling reproducible\n",
    "TRAIN_FRACTION = 0.8   # share of the rows used for training\n",
    "BATCH_SIZE = 1\n",
    "\n",
    "torch.manual_seed(SEED)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "FpRt69nWfFFv"
   },
   "source": [
    "### Create dataset with images and text and process them with BLIP's processor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "1i4BMba0ln91"
   },
   "outputs": [],
   "source": [
    "class Airbnb(Dataset):\n",
    "    \"\"\"Image/text pairs encoded with the BLIP processor.\n",
    "\n",
    "    Every CSV row must provide the columns `listing_id_x`, `photo_number_x`\n",
    "    and `answers`. The image of a row is read from\n",
    "    `<image_dir>/<listing_id_x>_<photo_number_x>`.\n",
    "\n",
    "    Args:\n",
    "        csv_file: path of the CSV file describing the samples.\n",
    "        data_augmentation: kept for API compatibility and stored on the\n",
    "            instance; no augmentation is applied by this class.\n",
    "        image_dir: folder that contains the image files.\n",
    "        model_name: checkpoint whose processor encodes image and text.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, csv_file, data_augmentation=None, image_dir=IMAGE_DIR, model_name=MODEL_NAME):\n",
    "        self.df = pd.read_csv(csv_file)\n",
    "        self.data_augmentation = data_augmentation\n",
    "        self.image_dir = Path(image_dir)\n",
    "        self.processor = AutoProcessor.from_pretrained(model_name)\n",
    "\n",
    "    def __len__(self):\n",
    "        return len(self.df)\n",
    "\n",
    "    def __getitem__(self, index):\n",
    "        # Positional, per-column lookup: works for any DataFrame index and\n",
    "        # keeps each column's own dtype when building the file name.\n",
    "        listing_id = self.df[\"listing_id_x\"].iloc[index]\n",
    "        photo_number = self.df[\"photo_number_x\"].iloc[index]\n",
    "        label = str(self.df[\"answers\"].iloc[index])\n",
    "\n",
    "        image_path = self.image_dir / f\"{listing_id}_{photo_number}\"\n",
    "        # Close the file handle as soon as the pixels have been decoded.\n",
    "        with Image.open(image_path) as raw_image:\n",
    "            image = raw_image.convert(\"RGB\")\n",
    "\n",
    "        encoding = self.processor(\n",
    "            images=image,\n",
    "            text=label,\n",
    "            padding=\"max_length\",\n",
    "            truncation=True,  # texts longer than the model limit would otherwise fail\n",
    "            return_tensors=\"pt\",\n",
    "        )\n",
    "        # The processor returns tensors with a leading batch axis of size 1;\n",
    "        # remove only that axis so the DataLoader can batch the samples itself.\n",
    "        return {key: value.squeeze(0) for key, value in encoding.items()}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "e2sr84dsfXt7"
   },
   "source": [
    "### Import CSV file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "Zl0asqIYpp4-"
   },
   "outputs": [],
   "source": [
    "csv_file = DATA_DIR / \"Picture_Descriptions_All-Copy.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "8uUjuOj-qGsv"
   },
   "outputs": [],
   "source": [
    "dataset = Airbnb(csv_file, data_augmentation=None)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "0IK-kRFxfd3H"
   },
   "source": [
    "### Split train/test dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "93wmNMwgqwgg"
   },
   "outputs": [],
   "source": [
    "train_size = int(TRAIN_FRACTION * len(dataset))\n",
    "test_size = len(dataset) - train_size\n",
    "# A seeded generator keeps the same rows in each split across kernel restarts.\n",
    "train_dataset, test_dataset = torch.utils.data.random_split(\n",
    "    dataset,\n",
    "    [train_size, test_size],\n",
    "    generator=torch.Generator().manual_seed(SEED),\n",
    ")\n",
    "assert len(train_dataset) + len(test_dataset) == len(dataset)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "3VWdqSeWfhAN"
   },
   "source": [
    "### Create dataloader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "0pJdUuSTqy-5"
   },
   "outputs": [],
   "source": [
    "train_loader = torch.utils.data.DataLoader(\n",
    "    train_dataset,\n",
    "    batch_size=BATCH_SIZE,\n",
    "    shuffle=True,\n",
    ")\n",
    "# Evaluation does not depend on sample order, so the test set is not shuffled.\n",
    "test_loader = torch.utils.data.DataLoader(\n",
    "    test_dataset,\n",
    "    batch_size=BATCH_SIZE,\n",
    "    shuffle=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "mnwwxvB_fjlx"
   },
   "source": [
    "### Import model and create device"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "jY6h9kpgq0KX"
   },
   "outputs": [],
   "source": [
    "model = BlipForConditionalGeneration.from_pretrained(MODEL_NAME)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "9rk60pCKfUkV"
   },
   "outputs": [],
   "source": [
    "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "HbiDQqzngCbn"
   },
   "source": [
    "### Train loop"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
"id": "i39jlG5Aq1Yo"
   },
   "outputs": [],
   "source": [
    "def run_epoch(model, loader, device, optimizer=None):\n",
    "    \"\"\"Run one pass over `loader` and return `(average_loss, token_accuracy)`.\n",
    "\n",
    "    Args:\n",
    "        model: BLIP captioning model, already moved to `device`.\n",
    "        loader: iterable of batches holding `input_ids`, `attention_mask`\n",
    "            and `pixel_values` tensors.\n",
    "        device: device the batch tensors are moved to.\n",
    "        optimizer: when given, the model is put in training mode and updated\n",
    "            after every batch; when `None`, the pass is evaluation-only and\n",
    "            no gradients are tracked.\n",
    "\n",
    "    Returns:\n",
    "        The mean batch loss and the share of non-padding target tokens that\n",
    "        the model predicted correctly.\n",
    "    \"\"\"\n",
    "    is_training = optimizer is not None\n",
    "    model.train(is_training)\n",
    "\n",
    "    total_loss = 0.0\n",
    "    total_correct = 0\n",
    "    total_tokens = 0\n",
    "\n",
    "    with torch.set_grad_enabled(is_training):\n",
    "        for batch in loader:\n",
    "            input_ids = batch[\"input_ids\"].to(device)\n",
    "            attention_mask = batch[\"attention_mask\"].to(device)\n",
    "            pixel_values = batch[\"pixel_values\"].to(device)\n",
    "            # Padding positions must not contribute to the loss: -100 is the\n",
    "            # index ignored by the cross-entropy loss.\n",
    "            labels = input_ids.masked_fill(attention_mask == 0, -100)\n",
    "\n",
    "            outputs = model(\n",
    "                input_ids=input_ids,\n",
    "                attention_mask=attention_mask,\n",
    "                pixel_values=pixel_values,\n",
    "                labels=labels,\n",
    "            )\n",
    "            loss = outputs.loss\n",
    "\n",
    "            if is_training:\n",
    "                optimizer.zero_grad()\n",
    "                loss.backward()\n",
    "                optimizer.step()\n",
    "\n",
    "            total_loss += loss.item()\n",
    "\n",
    "            # The logit at position t scores token t + 1, so shift by one\n",
    "            # before comparing and skip the ignored (padding) targets.\n",
    "            predictions = outputs.logits[:, :-1].argmax(dim=-1)\n",
    "            targets = labels[:, 1:]\n",
    "            valid = targets != -100\n",
    "            total_correct += (predictions[valid] == targets[valid]).sum().item()\n",
    "            total_tokens += valid.sum().item()\n",
    "\n",
    "    average_loss = total_loss / max(len(loader), 1)\n",
    "    accuracy = total_correct / max(total_tokens, 1)\n",
    "    return average_loss, accuracy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "NUM_EPOCHS = 5\n",
    "LEARNING_RATE = 5e-5\n",
    "\n",
    "model.to(device)\n",
    "optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)\n",
    "\n",
    "for epoch in range(NUM_EPOCHS):\n",
    "    print(\"Epoch:\", epoch)\n",
    "    average_loss, accuracy = run_epoch(model, train_loader, device, optimizer)\n",
    "    print(f\"Average Loss for epoch {epoch}: {average_loss:.4f}\")\n",
    "    print(f\"Accuracy for epoch {epoch}: {accuracy:.2f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "Dc4j-hLrgE6r"
   },
   "source": [
    "### Test loop"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "sMEMW6MiO0sS"
   },
   "outputs": [],
   "source": [
    "test_loss, test_accuracy = run_epoch(model, test_loader, device)\n",
    "print(f\"Test Average Loss: {test_loss:.4f}\")\n",
    "print(f\"Test Accuracy: {test_accuracy:.2f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "qBmjfndHgzFj"
   },
   "outputs": [],
   "source": [
    "%pip install -q huggingface_hub gradio"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "ISBzxw0Igout"
   },
   "source": [
    "### Gradio webapp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "tHSnxN7AZw8a"
   },
   "outputs": [],
   "source": [
    "import gradio as gr\n",
    "from gradio.components import Label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "eNDHwvGEad6n"
   },
   "outputs": [],
   "source": [
    "model.eval()  # inference only\n",
    "\n",
    "# Load the processor once instead of on every request.\n",
    "caption_processor = AutoProcessor.from_pretrained(\"Salesforce/blip-image-captioning-base\")\n",
    "\n",
    "\n",
    "def predict(image):\n",
    "    \"\"\"Inference function for Gradio: return the caption generated for `image`.\"\"\"\n",
    "    inputs = caption_processor(images=image, return_tensors=\"pt\").to(device)\n",
    "    generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=50)\n",
    "    return caption_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the Gradio interface: an image goes in, the generated caption comes out.\n",
    "iface = gr.Interface(\n",
    "    fn=predict,\n",
    "    inputs=gr.Image(type=\"pil\", label=\"Airbnb photo\"),\n",
    "    outputs=gr.Textbox(label=\"Generated caption\"),\n",
    ")\n",
    "# NOTE: share=True publishes a temporary public URL for the app.\n",
    "iface.launch(share=True)"
   ]
  }
 ],
 "metadata": {
  "accelerator":
"GPU", "colab": { "gpuType": "T4", "provenance": [] }, "kernelspec": { "display_name": "venv", "language": "python", "name": "venv" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 4 }