Spaces:

Atulit23
/

ColPali

Sleeping

File size: 7,930 Bytes

738e435

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Implementing Colpali with Qwen2VL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\atuli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Verbosity is set to 1 (active). Pass verbose=0 to make quieter.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.\n",
      "Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use\n",
      "`config.hidden_activation` if you want to override this behaviour.\n",
      "See https://github.com/huggingface/transformers/pull/29402 for more details.\n",
      "Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  6.01it/s]\n"
     ]
    }
   ],
   "source": [
    "from byaldi import RAGMultiModalModel\n",
    "\n",
    "RAG = RAGMultiModalModel.from_pretrained(\"vidore/colpali\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.\n",
      "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Added page 1 of document 0 to index.\n",
      "Index exported to .byaldi\\image_index\n",
      "Index exported to .byaldi\\image_index\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{0: 'image.png'}"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "RAG.index(\n",
    "    input_path=\"image.png\",\n",
    "    index_name=\"image_index\",\n",
    "    store_collection_with_index=False,\n",
    "    overwrite=True\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[{'doc_id': 0, 'page_num': 1, 'score': 18.75, 'metadata': {}, 'base64': None}]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text_query = \"What is the structure of the compiler?\"\n",
    "results = RAG.search(text_query, k=1)\n",
    "results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.\n",
      "Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}\n",
      "Loading checkpoint shards: 100%|██████████| 2/2 [00:13<00:00,  6.88s/it]\n"
     ]
    }
   ],
   "source": [
    "from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor\n",
    "from qwen_vl_utils import process_vision_info\n",
    "import torch\n",
    "\n",
    "model = Qwen2VLForConditionalGeneration.from_pretrained(\n",
    "        \"Qwen/Qwen2-VL-2B-Instruct\",\n",
    "        trust_remote_code=True,\n",
    "        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "results[0][\"page_num\"] -1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from PIL import Image\n",
    "processor = AutoProcessor.from_pretrained(\"Qwen/Qwen2-VL-2B-Instruct\", trust_remote_code=True)\n",
    "\n",
    "messages = [\n",
    "    {\n",
    "        \"role\": \"user\",\n",
    "        \"content\": [\n",
    "            {\n",
    "                \"type\": \"image\",\n",
    "                \"image\": Image.open(\"image.png\"),\n",
    "            },\n",
    "            {\"type\": \"text\", \"text\": text_query},\n",
    "        ],\n",
    "    }\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "text = processor.apply_chat_template(\n",
    "    messages, tokenize=False, add_generation_prompt=True\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "image_inputs, video_inputs = process_vision_info(messages)\n",
    "inputs = processor(\n",
    "    text=[text],\n",
    "    images=image_inputs,\n",
    "    videos=video_inputs,\n",
    "    padding=True,\n",
    "    return_tensors=\"pt\",\n",
    ")\n",
    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "inputs = inputs.to(device)\n",
    "model = model.to(device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "generated_ids = model.generate(**inputs, max_new_tokens=50)\n",
    "generated_ids_trimmed = [\n",
    "    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n",
    "]\n",
    "output_text = processor.batch_decode(\n",
    "    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['The structure of the compiler, as described in the syllabus, includes the following components:\\n\\n1. **Lexical Analysis**: This involves the role of the lexical analyzer, input buffering, and the design of lexical analyzers, specification and recognition of tokens']\n"
     ]
    }
   ],
   "source": [
    "print(output_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}