cassandramussard
committed on
Commit 73ab90f • 1 Parent(s): d17f8cc
add notebook and weights
- blip.ipynb +493 -0
- config.json +27 -0
- generation_config.json +7 -0
- model.safetensors +3 -0
- preprocessor_config.json +24 -0
- special_tokens_map.json +37 -0
- tokenizer.json +0 -0
- tokenizer_config.json +62 -0
blip.ipynb
ADDED
@@ -0,0 +1,493 @@
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"metadata": {
|
6 |
+
"id": "p5S2GYrJe6lb"
|
7 |
+
},
|
8 |
+
"source": [
|
9 |
+
"# Image to text for Airbnb images"
|
10 |
+
]
|
11 |
+
},
|
12 |
+
{
|
13 |
+
"cell_type": "code",
|
14 |
+
"execution_count": 1,
|
15 |
+
"metadata": {
|
16 |
+
"id": "lG3i-iiWe7l_"
|
17 |
+
},
|
18 |
+
"outputs": [
|
19 |
+
{
|
20 |
+
"name": "stderr",
|
21 |
+
"output_type": "stream",
|
22 |
+
"text": [
|
23 |
+
"/home/[email protected]/env/venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
24 |
+
" from .autonotebook import tqdm as notebook_tqdm\n"
|
25 |
+
]
|
26 |
+
}
|
27 |
+
],
|
28 |
+
"source": [
|
29 |
+
"import torch\n",
|
31 |
+
"from torch.utils.data import Dataset\n",
|
32 |
+
"from PIL import Image\n",
|
33 |
+
"import pandas as pd\n",
|
34 |
+
"from transformers import AutoProcessor\n",
|
35 |
+
"import numpy as np\n",
|
36 |
+
"from torchvision import transforms\n",
|
37 |
+
"from transformers import BlipForConditionalGeneration\n"
|
38 |
+
]
|
39 |
+
},
|
40 |
+
{
|
41 |
+
"cell_type": "markdown",
|
42 |
+
"metadata": {
|
43 |
+
"id": "FpRt69nWfFFv"
|
44 |
+
},
|
45 |
+
"source": [
|
46 |
+
"### Create dataset with images and text and process them with BLIP's processor"
|
47 |
+
]
|
48 |
+
},
|
49 |
+
{
|
50 |
+
"cell_type": "code",
|
51 |
+
"execution_count": 2,
|
52 |
+
"metadata": {
|
53 |
+
"id": "1i4BMba0ln91"
|
54 |
+
},
|
55 |
+
"outputs": [],
|
56 |
+
"source": [
|
57 |
+
"class Airbnb(Dataset):\n",
|
58 |
+
" def __init__(self, csv_file, data_augmentation):\n",
|
59 |
+
" self.df = pd.read_csv(csv_file)\n",
|
60 |
+
" self.processor = AutoProcessor.from_pretrained(\"Salesforce/blip-image-captioning-base\")\n",
|
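" # Note: data_augmentation is accepted but not applied here; image transforms could be hooked into __getitem__ if needed.\n",
"\n",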
61 |
+
" def __len__(self):\n",
|
62 |
+
" return self.df.shape[0]\n",
|
63 |
+
"\n",
|
64 |
+
" def __getitem__(self, index):\n",
|
65 |
+
" path_to_im = \"/home/[email protected]/image_to_text/blip/living_room/\" + str(self.df.listing_id_x[index])+ '_' + str(self.df.photo_number_x[index])\n",
|
66 |
+
" image = Image.open(path_to_im).convert(\"RGB\")\n",
|
67 |
+
" label = str(self.df.answers[index])\n",
|
68 |
+
" encoding = self.processor(images=image, text=label, padding=\"max_length\", return_tensors=\"pt\")\n",
|
69 |
+
" encoding = {k:v.squeeze() for k,v in encoding.items()}\n",
|
70 |
+
" return encoding"
|
71 |
+
]
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"cell_type": "markdown",
|
75 |
+
"metadata": {
|
76 |
+
"id": "e2sr84dsfXt7"
|
77 |
+
},
|
78 |
+
"source": [
|
79 |
+
"### Import CSV file"
|
80 |
+
]
|
81 |
+
},
|
82 |
+
{
|
83 |
+
"cell_type": "code",
|
84 |
+
"execution_count": 3,
|
85 |
+
"metadata": {
|
86 |
+
"id": "Zl0asqIYpp4-"
|
87 |
+
},
|
88 |
+
"outputs": [],
|
89 |
+
"source": [
|
90 |
+
"csv_file = \"/home/[email protected]/image_to_text/blip/Picture_Descriptions_All-Copy.csv\""
|
91 |
+
]
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"cell_type": "code",
|
95 |
+
"execution_count": 4,
|
96 |
+
"metadata": {
|
97 |
+
"id": "8uUjuOj-qGsv"
|
98 |
+
},
|
99 |
+
"outputs": [],
|
100 |
+
"source": [
|
101 |
+
"dataset = Airbnb(csv_file, data_augmentation = None)"
|
102 |
+
]
|
103 |
+
},
|
104 |
+
{
|
105 |
+
"cell_type": "markdown",
|
106 |
+
"metadata": {
|
107 |
+
"id": "0IK-kRFxfd3H"
|
108 |
+
},
|
109 |
+
"source": [
|
110 |
+
"### Split train/test dataset"
|
111 |
+
]
|
112 |
+
},
|
113 |
+
{
|
114 |
+
"cell_type": "code",
|
115 |
+
"execution_count": 5,
|
116 |
+
"metadata": {
|
117 |
+
"id": "93wmNMwgqwgg"
|
118 |
+
},
|
119 |
+
"outputs": [],
|
120 |
+
"source": [
|
121 |
+
"train_size = int(0.8 * len(dataset))\n",
|
122 |
+
"test_size = len(dataset) - train_size\n",
|
123 |
+
"train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])"
|
124 |
+
]
|
125 |
+
},
|
126 |
+
{
|
127 |
+
"cell_type": "markdown",
|
128 |
+
"metadata": {
|
129 |
+
"id": "3VWdqSeWfhAN"
|
130 |
+
},
|
131 |
+
"source": [
|
132 |
+
"### Create dataloader"
|
133 |
+
]
|
134 |
+
},
|
135 |
+
{
|
136 |
+
"cell_type": "code",
|
137 |
+
"execution_count": 6,
|
138 |
+
"metadata": {
|
139 |
+
"id": "0pJdUuSTqy-5"
|
140 |
+
},
|
141 |
+
"outputs": [],
|
142 |
+
"source": [
|
143 |
+
"train_loader = torch.utils.data.DataLoader(\n",
|
144 |
+
" train_dataset,\n",
|
145 |
+
" batch_size=1,\n",
|
146 |
+
" shuffle=True\n",
|
147 |
+
" )\n",
|
148 |
+
"test_loader = torch.utils.data.DataLoader(\n",
|
149 |
+
" test_dataset,\n",
|
150 |
+
" batch_size=1,\n",
|
151 |
+
" shuffle=True\n",
|
152 |
+
" )"
|
153 |
+
]
|
154 |
+
},
|
155 |
+
{
|
156 |
+
"cell_type": "markdown",
|
157 |
+
"metadata": {
|
158 |
+
"id": "mnwwxvB_fjlx"
|
159 |
+
},
|
160 |
+
"source": [
|
161 |
+
"### Import model and create device"
|
162 |
+
]
|
163 |
+
},
|
164 |
+
{
|
165 |
+
"cell_type": "code",
|
166 |
+
"execution_count": 7,
|
167 |
+
"metadata": {
|
168 |
+
"id": "jY6h9kpgq0KX"
|
169 |
+
},
|
170 |
+
"outputs": [],
|
171 |
+
"source": [
|
172 |
+
"model = BlipForConditionalGeneration.from_pretrained(\"Salesforce/blip-image-captioning-base\")"
|
173 |
+
]
|
174 |
+
},
|
175 |
+
{
|
176 |
+
"cell_type": "code",
|
177 |
+
"execution_count": 8,
|
178 |
+
"metadata": {
|
179 |
+
"id": "9rk60pCKfUkV"
|
180 |
+
},
|
181 |
+
"outputs": [],
|
182 |
+
"source": [
|
183 |
+
"device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")"
|
184 |
+
]
|
185 |
+
},
|
186 |
+
{
|
187 |
+
"cell_type": "markdown",
|
188 |
+
"metadata": {
|
189 |
+
"id": "HbiDQqzngCbn"
|
190 |
+
},
|
191 |
+
"source": [
|
192 |
+
"### Train loop"
|
193 |
+
]
|
194 |
+
},
|
195 |
+
{
|
196 |
+
"cell_type": "code",
|
197 |
+
"execution_count": 9,
|
198 |
+
"metadata": {
|
199 |
+
"colab": {
|
200 |
+
"base_uri": "https://localhost:8080/"
|
201 |
+
},
|
202 |
+
"id": "i39jlG5Aq1Yo",
|
203 |
+
"outputId": "a5292b17-f2b9-4a38-db0a-3f97d4923aa4"
|
204 |
+
},
|
205 |
+
"outputs": [
|
206 |
+
{
|
207 |
+
"name": "stdout",
|
208 |
+
"output_type": "stream",
|
209 |
+
"text": [
|
210 |
+
"Epoch: 0\n"
|
211 |
+
]
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"name": "stderr",
|
215 |
+
"output_type": "stream",
|
216 |
+
"text": [
|
217 |
+
"We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.\n"
|
218 |
+
]
|
219 |
+
},
|
220 |
+
{
|
221 |
+
"ename": "KeyboardInterrupt",
|
222 |
+
"evalue": "",
|
223 |
+
"output_type": "error",
|
224 |
+
"traceback": [
|
225 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
226 |
+
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
|
227 |
+
"Cell \u001b[0;32mIn[9], line 25\u001b[0m\n\u001b[1;32m 22\u001b[0m total_examples \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m labels\u001b[38;5;241m.\u001b[39mnumel()\n\u001b[1;32m 24\u001b[0m loss\u001b[38;5;241m.\u001b[39mbackward()\n\u001b[0;32m---> 25\u001b[0m \u001b[43moptimizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstep\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 26\u001b[0m optimizer\u001b[38;5;241m.\u001b[39mzero_grad()\n\u001b[1;32m 28\u001b[0m average_loss \u001b[38;5;241m=\u001b[39m total_loss \u001b[38;5;241m/\u001b[39m \u001b[38;5;28mlen\u001b[39m(train_loader)\n",
|
228 |
+
"File \u001b[0;32m~/env/venv/lib/python3.10/site-packages/torch/optim/optimizer.py:385\u001b[0m, in \u001b[0;36mOptimizer.profile_hook_step.<locals>.wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 380\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 381\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\n\u001b[1;32m 382\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfunc\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m must return None or a tuple of (new_args, new_kwargs), but got \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresult\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 383\u001b[0m )\n\u001b[0;32m--> 385\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 386\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_optimizer_step_code()\n\u001b[1;32m 388\u001b[0m \u001b[38;5;66;03m# call optimizer step post hooks\u001b[39;00m\n",
|
229 |
+
"File \u001b[0;32m~/env/venv/lib/python3.10/site-packages/torch/optim/optimizer.py:76\u001b[0m, in \u001b[0;36m_use_grad_for_differentiable.<locals>._use_grad\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 74\u001b[0m torch\u001b[38;5;241m.\u001b[39mset_grad_enabled(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefaults[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdifferentiable\u001b[39m\u001b[38;5;124m'\u001b[39m])\n\u001b[1;32m 75\u001b[0m torch\u001b[38;5;241m.\u001b[39m_dynamo\u001b[38;5;241m.\u001b[39mgraph_break()\n\u001b[0;32m---> 76\u001b[0m ret \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 77\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 78\u001b[0m torch\u001b[38;5;241m.\u001b[39m_dynamo\u001b[38;5;241m.\u001b[39mgraph_break()\n",
|
230 |
+
"File \u001b[0;32m~/env/venv/lib/python3.10/site-packages/torch/optim/adamw.py:187\u001b[0m, in \u001b[0;36mAdamW.step\u001b[0;34m(self, closure)\u001b[0m\n\u001b[1;32m 174\u001b[0m beta1, beta2 \u001b[38;5;241m=\u001b[39m group[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbetas\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 176\u001b[0m has_complex \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_init_group(\n\u001b[1;32m 177\u001b[0m group,\n\u001b[1;32m 178\u001b[0m params_with_grad,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 184\u001b[0m state_steps,\n\u001b[1;32m 185\u001b[0m )\n\u001b[0;32m--> 187\u001b[0m \u001b[43madamw\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 188\u001b[0m \u001b[43m \u001b[49m\u001b[43mparams_with_grad\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 189\u001b[0m \u001b[43m \u001b[49m\u001b[43mgrads\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 190\u001b[0m \u001b[43m \u001b[49m\u001b[43mexp_avgs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 191\u001b[0m \u001b[43m \u001b[49m\u001b[43mexp_avg_sqs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 192\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_exp_avg_sqs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 193\u001b[0m \u001b[43m \u001b[49m\u001b[43mstate_steps\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 194\u001b[0m \u001b[43m \u001b[49m\u001b[43mamsgrad\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mamsgrad\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 195\u001b[0m \u001b[43m \u001b[49m\u001b[43mbeta1\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbeta1\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 196\u001b[0m \u001b[43m \u001b[49m\u001b[43mbeta2\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbeta2\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 197\u001b[0m \u001b[43m \u001b[49m\u001b[43mlr\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mlr\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 198\u001b[0m \u001b[43m \u001b[49m\u001b[43mweight_decay\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mweight_decay\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 199\u001b[0m \u001b[43m \u001b[49m\u001b[43meps\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43meps\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 200\u001b[0m \u001b[43m \u001b[49m\u001b[43mmaximize\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmaximize\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 201\u001b[0m \u001b[43m \u001b[49m\u001b[43mforeach\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mforeach\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 202\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mcapturable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcapturable\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 203\u001b[0m \u001b[43m \u001b[49m\u001b[43mdifferentiable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdifferentiable\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 204\u001b[0m \u001b[43m \u001b[49m\u001b[43mfused\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mfused\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 205\u001b[0m \u001b[43m \u001b[49m\u001b[43mgrad_scale\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mgrad_scale\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 206\u001b[0m \u001b[43m \u001b[49m\u001b[43mfound_inf\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mfound_inf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 207\u001b[0m \u001b[43m \u001b[49m\u001b[43mhas_complex\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhas_complex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 208\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 210\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m loss\n",
|
231 |
+
"File \u001b[0;32m~/env/venv/lib/python3.10/site-packages/torch/optim/adamw.py:339\u001b[0m, in \u001b[0;36madamw\u001b[0;34m(params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, foreach, capturable, differentiable, fused, grad_scale, found_inf, has_complex, amsgrad, beta1, beta2, lr, weight_decay, eps, maximize)\u001b[0m\n\u001b[1;32m 336\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 337\u001b[0m func \u001b[38;5;241m=\u001b[39m _single_tensor_adamw\n\u001b[0;32m--> 339\u001b[0m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 340\u001b[0m \u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 341\u001b[0m \u001b[43m \u001b[49m\u001b[43mgrads\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 342\u001b[0m \u001b[43m \u001b[49m\u001b[43mexp_avgs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 343\u001b[0m \u001b[43m \u001b[49m\u001b[43mexp_avg_sqs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 344\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_exp_avg_sqs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 345\u001b[0m \u001b[43m \u001b[49m\u001b[43mstate_steps\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 346\u001b[0m \u001b[43m \u001b[49m\u001b[43mamsgrad\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mamsgrad\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 347\u001b[0m \u001b[43m \u001b[49m\u001b[43mbeta1\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbeta1\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 348\u001b[0m \u001b[43m \u001b[49m\u001b[43mbeta2\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbeta2\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 349\u001b[0m \u001b[43m \u001b[49m\u001b[43mlr\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlr\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 350\u001b[0m \u001b[43m \u001b[49m\u001b[43mweight_decay\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mweight_decay\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 351\u001b[0m \u001b[43m \u001b[49m\u001b[43meps\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43meps\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 352\u001b[0m \u001b[43m \u001b[49m\u001b[43mmaximize\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmaximize\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 353\u001b[0m \u001b[43m \u001b[49m\u001b[43mcapturable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcapturable\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 354\u001b[0m \u001b[43m \u001b[49m\u001b[43mdifferentiable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdifferentiable\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 355\u001b[0m \u001b[43m \u001b[49m\u001b[43mgrad_scale\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgrad_scale\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 356\u001b[0m \u001b[43m \u001b[49m\u001b[43mfound_inf\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfound_inf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 357\u001b[0m \u001b[43m \u001b[49m\u001b[43mhas_complex\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhas_complex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 358\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
|
232 |
+
"File \u001b[0;32m~/env/venv/lib/python3.10/site-packages/torch/optim/adamw.py:552\u001b[0m, in \u001b[0;36m_multi_tensor_adamw\u001b[0;34m(params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, grad_scale, found_inf, amsgrad, beta1, beta2, lr, weight_decay, eps, maximize, capturable, differentiable, has_complex)\u001b[0m\n\u001b[1;32m 549\u001b[0m torch\u001b[38;5;241m.\u001b[39m_foreach_lerp_(device_exp_avgs, device_grads, \u001b[38;5;241m1\u001b[39m \u001b[38;5;241m-\u001b[39m beta1)\n\u001b[1;32m 551\u001b[0m torch\u001b[38;5;241m.\u001b[39m_foreach_mul_(device_exp_avg_sqs, beta2)\n\u001b[0;32m--> 552\u001b[0m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_foreach_addcmul_\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdevice_exp_avg_sqs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdevice_grads\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdevice_grads\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mbeta2\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 554\u001b[0m \u001b[38;5;66;03m# Delete the local intermediate since it won't be used anymore to save on peak memory\u001b[39;00m\n\u001b[1;32m 555\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m device_grads\n",
|
233 |
+
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
|
234 |
+
]
|
235 |
+
}
|
236 |
+
],
|
237 |
+
"source": [
|
238 |
+
"optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)\n",
|
239 |
+
"model.to(device)\n",
|
240 |
+
"model.train()\n",
|
241 |
+
"for epoch in range(5):\n",
|
242 |
+
" print(\"Epoch:\", epoch)\n",
|
243 |
+
" total_loss = 0.0\n",
|
244 |
+
" total_correct = 0\n",
|
245 |
+
" total_examples = 0\n",
|
246 |
+
"\n",
|
247 |
+
" for idx, batch in enumerate(train_loader):\n",
|
248 |
+
" input_ids = batch.pop(\"input_ids\").to(device)\n",
|
249 |
+
" pixel_values = batch.pop(\"pixel_values\").to(device)\n",
|
250 |
+
" labels = input_ids\n",
|
251 |
+
"\n",
|
252 |
+
" outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=labels)\n",
|
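" # The caption tokens are used as both inputs and labels; passing the attention_mask returned by the processor (if present in the batch) would avoid the padding warning shown above.\n",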
253 |
+
" loss = outputs.loss\n",
|
254 |
+
" total_loss += loss.item()\n",
|
255 |
+
"\n",
|
256 |
+
" predictions = torch.argmax(outputs.logits, dim=-1)\n",
|
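" # Token-level accuracy over every position, including [PAD] tokens, so this figure is optimistic.\n",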
257 |
+
" correct = (predictions == labels).sum().item()\n",
|
258 |
+
" total_correct += correct\n",
|
259 |
+
" total_examples += labels.numel()\n",
|
260 |
+
"\n",
|
261 |
+
" loss.backward()\n",
|
262 |
+
" optimizer.step()\n",
|
263 |
+
" optimizer.zero_grad()\n",
|
264 |
+
"\n",
|
265 |
+
" average_loss = total_loss / len(train_loader)\n",
|
266 |
+
" accuracy = total_correct / total_examples\n",
|
267 |
+
" print(f\"Average Loss for epoch {epoch}: {average_loss:.4f}\")\n",
|
268 |
+
" print(f\"Accuracy for epoch {epoch}: {accuracy:.2f}\")"
|
269 |
+
]
|
270 |
+
},
|
271 |
+
{
|
272 |
+
"cell_type": "markdown",
|
273 |
+
"metadata": {
|
274 |
+
"id": "Dc4j-hLrgE6r"
|
275 |
+
},
|
276 |
+
"source": [
|
277 |
+
"### Test loop"
|
278 |
+
]
|
279 |
+
},
|
280 |
+
{
|
281 |
+
"cell_type": "code",
|
282 |
+
"execution_count": null,
|
283 |
+
"metadata": {
|
284 |
+
"id": "sMEMW6MiO0sS"
|
285 |
+
},
|
286 |
+
"outputs": [],
|
287 |
+
"source": [
|
288 |
+
"model.eval()\n",
|
289 |
+
"with torch.no_grad():\n",
|
290 |
+
" total_loss = 0.0\n",
|
291 |
+
" total_correct = 0\n",
|
292 |
+
" total_examples = 0\n",
|
293 |
+
"\n",
|
294 |
+
" for idx, batch in enumerate(test_loader):\n",
|
295 |
+
" input_ids = batch.pop(\"input_ids\").to(device)\n",
|
296 |
+
" pixel_values = batch.pop(\"pixel_values\").to(device)\n",
|
297 |
+
" labels = input_ids\n",
|
298 |
+
"\n",
|
299 |
+
" outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=labels)\n",
|
300 |
+
" loss = outputs.loss\n",
|
301 |
+
" total_loss += loss.item()\n",
|
302 |
+
"\n",
|
303 |
+
" predictions = torch.argmax(outputs.logits, dim=-1)\n",
|
304 |
+
" correct = (predictions == labels).sum().item()\n",
|
305 |
+
" total_correct += correct\n",
|
306 |
+
" total_examples += labels.numel()\n",
|
307 |
+
"\n",
|
308 |
+
" average_loss = total_loss / len(test_loader)\n",
|
309 |
+
" accuracy = total_correct / total_examples\n",
|
310 |
+
" print(f\"Test Average Loss: {average_loss:.4f}\")\n",
|
311 |
+
" print(f\"Test Accuracy: {accuracy:.2f}\")"
|
312 |
+
]
|
313 |
+
},
|
314 |
+
{
|
315 |
+
"cell_type": "code",
|
316 |
+
"execution_count": null,
|
317 |
+
"metadata": {
|
318 |
+
"id": "qcKs5-3Jgz-M"
|
319 |
+
},
|
320 |
+
"outputs": [],
|
321 |
+
"source": []
|
322 |
+
},
|
323 |
+
{
|
324 |
+
"cell_type": "code",
|
325 |
+
"execution_count": null,
|
326 |
+
"metadata": {
|
327 |
+
"id": "ObYnoCzag0Aq"
|
328 |
+
},
|
329 |
+
"outputs": [],
|
330 |
+
"source": []
|
331 |
+
},
|
332 |
+
{
|
333 |
+
"cell_type": "code",
|
334 |
+
"execution_count": null,
|
335 |
+
"metadata": {
|
336 |
+
"id": "rY6u33avg0CM"
|
337 |
+
},
|
338 |
+
"outputs": [],
|
339 |
+
"source": []
|
340 |
+
},
|
341 |
+
{
|
342 |
+
"cell_type": "code",
|
343 |
+
"execution_count": null,
|
344 |
+
"metadata": {
|
345 |
+
"id": "8EZkrYFqg0E2"
|
346 |
+
},
|
347 |
+
"outputs": [],
|
348 |
+
"source": []
|
349 |
+
},
|
350 |
+
{
|
351 |
+
"cell_type": "code",
|
352 |
+
"execution_count": 10,
|
353 |
+
"metadata": {
|
354 |
+
"id": "qBmjfndHgzFj"
|
355 |
+
},
|
356 |
+
"outputs": [
|
357 |
+
{
|
358 |
+
"name": "stderr",
|
359 |
+
"output_type": "stream",
|
360 |
+
"text": [
|
361 |
+
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
|
362 |
+
"To disable this warning, you can either:\n",
|
363 |
+
"\t- Avoid using `tokenizers` before the fork if possible\n",
|
364 |
+
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
|
365 |
+
]
|
366 |
+
},
|
367 |
+
{
|
368 |
+
"name": "stdout",
|
369 |
+
"output_type": "stream",
|
370 |
+
"text": [
|
371 |
+
"Requirement already satisfied: huggingface_hub in /home/[email protected]/env/venv/lib/python3.10/site-packages (0.22.2)\n",
|
372 |
+
"Requirement already satisfied: tqdm>=4.42.1 in /home/[email protected]/env/venv/lib/python3.10/site-packages (from huggingface_hub) (4.66.2)\n",
|
373 |
+
"Requirement already satisfied: requests in /home/[email protected]/env/venv/lib/python3.10/site-packages (from huggingface_hub) (2.31.0)\n",
|
374 |
+
"Requirement already satisfied: typing-extensions>=3.7.4.3 in /home/[email protected]/env/venv/lib/python3.10/site-packages (from huggingface_hub) (4.11.0)\n",
|
375 |
+
"Requirement already satisfied: filelock in /home/[email protected]/env/venv/lib/python3.10/site-packages (from huggingface_hub) (3.13.4)\n",
|
376 |
+
"Requirement already satisfied: fsspec>=2023.5.0 in /home/[email protected]/env/venv/lib/python3.10/site-packages (from huggingface_hub) (2024.3.1)\n",
|
377 |
+
"Requirement already satisfied: pyyaml>=5.1 in /home/[email protected]/env/venv/lib/python3.10/site-packages (from huggingface_hub) (6.0.1)\n",
|
378 |
+
"Requirement already satisfied: packaging>=20.9 in /home/[email protected]/env/venv/lib/python3.10/site-packages (from huggingface_hub) (24.0)\n",
|
379 |
+
"Requirement already satisfied: certifi>=2017.4.17 in /home/[email protected]/env/venv/lib/python3.10/site-packages (from requests->huggingface_hub) (2024.2.2)\n",
|
380 |
+
"Requirement already satisfied: urllib3<3,>=1.21.1 in /home/[email protected]/env/venv/lib/python3.10/site-packages (from requests->huggingface_hub) (2.2.1)\n",
|
381 |
+
"Requirement already satisfied: idna<4,>=2.5 in /home/[email protected]/env/venv/lib/python3.10/site-packages (from requests->huggingface_hub) (3.7)\n",
|
382 |
+
"Requirement already satisfied: charset-normalizer<4,>=2 in /home/[email protected]/env/venv/lib/python3.10/site-packages (from requests->huggingface_hub) (3.3.2)\n",
|
383 |
+
"Note: you may need to restart the kernel to use updated packages.\n"
|
384 |
+
]
|
385 |
+
}
|
386 |
+
],
|
387 |
+
"source": [
|
388 |
+
"pip install huggingface_hub"
|
389 |
+
]
|
390 |
+
},
|
391 |
+
{
|
392 |
+
"cell_type": "markdown",
|
393 |
+
"metadata": {
|
394 |
+
"id": "ISBzxw0Igout"
|
395 |
+
},
|
396 |
+
"source": [
|
397 |
+
"### Gradio webapp"
|
398 |
+
]
|
399 |
+
},
|
400 |
+
{
|
401 |
+
"cell_type": "code",
|
402 |
+
"execution_count": null,
|
403 |
+
"metadata": {
|
404 |
+
"colab": {
|
405 |
+
"base_uri": "https://localhost:8080/",
|
406 |
+
"height": 337
|
407 |
+
},
|
408 |
+
"id": "tHSnxN7AZw8a",
|
409 |
+
"outputId": "8fc49c5d-de24-4a57-e86d-2e63010b382d"
|
410 |
+
},
|
411 |
+
"outputs": [
|
412 |
+
{
|
413 |
+
"ename": "ModuleNotFoundError",
|
414 |
+
"errorDetails": {
|
415 |
+
"actions": [
|
416 |
+
{
|
417 |
+
"action": "open_url",
|
418 |
+
"actionText": "Open Examples",
|
419 |
+
"url": "/notebooks/snippets/importing_libraries.ipynb"
|
420 |
+
}
|
421 |
+
]
|
422 |
+
},
|
423 |
+
"evalue": "No module named 'gradio'",
|
424 |
+
"output_type": "error",
|
425 |
+
"traceback": [
|
426 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
427 |
+
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
|
428 |
+
"\u001b[0;32m<ipython-input-38-c71c84f2e5e0>\u001b[0m in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mgradio\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mgr\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mgradio\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcomponents\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mLabel\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
429 |
+
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'gradio'",
|
430 |
+
"",
|
431 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0;32m\nNOTE: If your import is failing due to a missing package, you can\nmanually install dependencies using either !pip or !apt.\n\nTo view examples of installing some common dependencies, click the\n\"Open Examples\" button below.\n\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n"
|
432 |
+
]
|
433 |
+
}
|
434 |
+
],
|
435 |
+
"source": [
|
436 |
+
"import gradio as gr\n",
|
437 |
+
"from gradio.components import Label"
|
438 |
+
]
|
439 |
+
},
|
440 |
+
{
|
441 |
+
"cell_type": "code",
|
442 |
+
"execution_count": null,
|
443 |
+
"metadata": {
|
444 |
+
"id": "eNDHwvGEad6n"
|
445 |
+
},
|
446 |
+
"outputs": [],
|
447 |
+
"source": [
|
448 |
+
"model.eval() # Mettez votre modèle en mode évaluation\n",
|
449 |
+
"\n",
|
450 |
+
"# Fonction d'inférence pour Gradio\n",
|
451 |
+
"def predict(image):\n",
|
452 |
+
" processor = AutoProcessor.from_pretrained(\"Salesforce/blip-image-captioning-base\")\n",
|
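" # Reloading the processor on every call is slow; it could be created once outside predict() and reused.\n",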
453 |
+
" inputs = processor(images=image, return_tensors=\"pt\").to(device)\n",
|
454 |
+
" pixel_values = inputs.pixel_values\n",
|
455 |
+
"\n",
|
456 |
+
" generated_ids = model.generate(pixel_values=pixel_values, max_length=50)\n",
|
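" # max_length=50 caps the caption length; num_beams could also be passed for beam search if desired.\n",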
457 |
+
" generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]\n",
|
458 |
+
"\n",
|
459 |
+
"# Création de l'interface Gradio\n",
|
460 |
+
"iface = gr.Interface(fn=predict,\n",
|
461 |
+
" inputs=gr.components.Textbox(placeholder=\"Enter your text here...\"),\n",
|
462 |
+
" outputs=gr.components.Label(num_top_classes=2))\n",
|
463 |
+
"iface.launch(share=True)"
|
464 |
+
]
|
465 |
+
}
|
466 |
+
],
|
467 |
+
"metadata": {
|
468 |
+
"accelerator": "GPU",
|
469 |
+
"colab": {
|
470 |
+
"gpuType": "T4",
|
471 |
+
"provenance": []
|
472 |
+
},
|
473 |
+
"kernelspec": {
|
474 |
+
"display_name": "venv",
|
475 |
+
"language": "python",
|
476 |
+
"name": "venv"
|
477 |
+
},
|
478 |
+
"language_info": {
|
479 |
+
"codemirror_mode": {
|
480 |
+
"name": "ipython",
|
481 |
+
"version": 3
|
482 |
+
},
|
483 |
+
"file_extension": ".py",
|
484 |
+
"mimetype": "text/x-python",
|
485 |
+
"name": "python",
|
486 |
+
"nbconvert_exporter": "python",
|
487 |
+
"pygments_lexer": "ipython3",
|
488 |
+
"version": "3.10.12"
|
489 |
+
}
|
490 |
+
},
|
491 |
+
"nbformat": 4,
|
492 |
+
"nbformat_minor": 4
|
493 |
+
}
|
config.json
ADDED
@@ -0,0 +1,27 @@
1 |
+
{
|
2 |
+
"_name_or_path": "Salesforce/blip-image-captioning-base",
|
3 |
+
"architectures": [
|
4 |
+
"BlipForConditionalGeneration"
|
5 |
+
],
|
6 |
+
"image_text_hidden_size": 256,
|
7 |
+
"initializer_factor": 1.0,
|
8 |
+
"initializer_range": 0.02,
|
9 |
+
"label_smoothing": 0.0,
|
10 |
+
"logit_scale_init_value": 2.6592,
|
11 |
+
"model_type": "blip",
|
12 |
+
"projection_dim": 512,
|
13 |
+
"text_config": {
|
14 |
+
"initializer_factor": 1.0,
|
15 |
+
"model_type": "blip_text_model",
|
16 |
+
"num_attention_heads": 12
|
17 |
+
},
|
18 |
+
"torch_dtype": "float32",
|
19 |
+
"transformers_version": "4.38.2",
|
20 |
+
"vision_config": {
|
21 |
+
"dropout": 0.0,
|
22 |
+
"initializer_factor": 1.0,
|
23 |
+
"initializer_range": 0.02,
|
24 |
+
"model_type": "blip_vision_model",
|
25 |
+
"num_channels": 3
|
26 |
+
}
|
27 |
+
}
|
generation_config.json
ADDED
@@ -0,0 +1,7 @@
1 |
+
{
|
2 |
+
"_from_model_config": true,
|
3 |
+
"bos_token_id": 30522,
|
4 |
+
"eos_token_id": 2,
|
5 |
+
"pad_token_id": 0,
|
6 |
+
"transformers_version": "4.38.2"
|
7 |
+
}
|
model.safetensors
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:393dfdb97bb82ebafd725764c989a0fbf37086428ebdec3b5625c8bd4916e412
|
3 |
+
size 989717056
|
preprocessor_config.json
ADDED
@@ -0,0 +1,24 @@
1 |
+
{
|
2 |
+
"do_convert_rgb": true,
|
3 |
+
"do_normalize": true,
|
4 |
+
"do_rescale": true,
|
5 |
+
"do_resize": true,
|
6 |
+
"image_mean": [
|
7 |
+
0.48145466,
|
8 |
+
0.4578275,
|
9 |
+
0.40821073
|
10 |
+
],
|
11 |
+
"image_processor_type": "BlipImageProcessor",
|
12 |
+
"image_std": [
|
13 |
+
0.26862954,
|
14 |
+
0.26130258,
|
15 |
+
0.27577711
|
16 |
+
],
|
17 |
+
"processor_class": "BlipProcessor",
|
18 |
+
"resample": 3,
|
19 |
+
"rescale_factor": 0.00392156862745098,
|
20 |
+
"size": {
|
21 |
+
"height": 384,
|
22 |
+
"width": 384
|
23 |
+
}
|
24 |
+
}
|
special_tokens_map.json
ADDED
@@ -0,0 +1,37 @@
1 |
+
{
|
2 |
+
"cls_token": {
|
3 |
+
"content": "[CLS]",
|
4 |
+
"lstrip": false,
|
5 |
+
"normalized": false,
|
6 |
+
"rstrip": false,
|
7 |
+
"single_word": false
|
8 |
+
},
|
9 |
+
"mask_token": {
|
10 |
+
"content": "[MASK]",
|
11 |
+
"lstrip": false,
|
12 |
+
"normalized": false,
|
13 |
+
"rstrip": false,
|
14 |
+
"single_word": false
|
15 |
+
},
|
16 |
+
"pad_token": {
|
17 |
+
"content": "[PAD]",
|
18 |
+
"lstrip": false,
|
19 |
+
"normalized": false,
|
20 |
+
"rstrip": false,
|
21 |
+
"single_word": false
|
22 |
+
},
|
23 |
+
"sep_token": {
|
24 |
+
"content": "[SEP]",
|
25 |
+
"lstrip": false,
|
26 |
+
"normalized": false,
|
27 |
+
"rstrip": false,
|
28 |
+
"single_word": false
|
29 |
+
},
|
30 |
+
"unk_token": {
|
31 |
+
"content": "[UNK]",
|
32 |
+
"lstrip": false,
|
33 |
+
"normalized": false,
|
34 |
+
"rstrip": false,
|
35 |
+
"single_word": false
|
36 |
+
}
|
37 |
+
}
|
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
tokenizer_config.json
ADDED
@@ -0,0 +1,62 @@
1 |
+
{
|
2 |
+
"added_tokens_decoder": {
|
3 |
+
"0": {
|
4 |
+
"content": "[PAD]",
|
5 |
+
"lstrip": false,
|
6 |
+
"normalized": false,
|
7 |
+
"rstrip": false,
|
8 |
+
"single_word": false,
|
9 |
+
"special": true
|
10 |
+
},
|
11 |
+
"100": {
|
12 |
+
"content": "[UNK]",
|
13 |
+
"lstrip": false,
|
14 |
+
"normalized": false,
|
15 |
+
"rstrip": false,
|
16 |
+
"single_word": false,
|
17 |
+
"special": true
|
18 |
+
},
|
19 |
+
"101": {
|
20 |
+
"content": "[CLS]",
|
21 |
+
"lstrip": false,
|
22 |
+
"normalized": false,
|
23 |
+
"rstrip": false,
|
24 |
+
"single_word": false,
|
25 |
+
"special": true
|
26 |
+
},
|
27 |
+
"102": {
|
28 |
+
"content": "[SEP]",
|
29 |
+
"lstrip": false,
|
30 |
+
"normalized": false,
|
31 |
+
"rstrip": false,
|
32 |
+
"single_word": false,
|
33 |
+
"special": true
|
34 |
+
},
|
35 |
+
"103": {
|
36 |
+
"content": "[MASK]",
|
37 |
+
"lstrip": false,
|
38 |
+
"normalized": false,
|
39 |
+
"rstrip": false,
|
40 |
+
"single_word": false,
|
41 |
+
"special": true
|
42 |
+
}
|
43 |
+
},
|
44 |
+
"clean_up_tokenization_spaces": true,
|
45 |
+
"cls_token": "[CLS]",
|
46 |
+
"do_basic_tokenize": true,
|
47 |
+
"do_lower_case": true,
|
48 |
+
"mask_token": "[MASK]",
|
49 |
+
"model_input_names": [
|
50 |
+
"input_ids",
|
51 |
+
"attention_mask"
|
52 |
+
],
|
53 |
+
"model_max_length": 512,
|
54 |
+
"never_split": null,
|
55 |
+
"pad_token": "[PAD]",
|
56 |
+
"processor_class": "BlipProcessor",
|
57 |
+
"sep_token": "[SEP]",
|
58 |
+
"strip_accents": null,
|
59 |
+
"tokenize_chinese_chars": true,
|
60 |
+
"tokenizer_class": "BertTokenizer",
|
61 |
+
"unk_token": "[UNK]"
|
62 |
+
}
|