{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Implementing Colpali with Qwen2VL" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\atuli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Verbosity is set to 1 (active). Pass verbose=0 to make quieter.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.\n", "Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use\n", "`config.hidden_activation` if you want to override this behaviour.\n", "See https://github.com/huggingface/transformers/pull/29402 for more details.\n", "Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 6.01it/s]\n" ] } ], "source": [ "from byaldi import RAGMultiModalModel\n", "\n", "RAG = RAGMultiModalModel.from_pretrained(\"vidore/colpali\")" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `` tokens in the very beginning of your text and `` token after that. For this call, we will infer how many images each text has and add special tokens.\n", "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Added page 1 of document 0 to index.\n", "Index exported to .byaldi\\image_index\n", "Index exported to .byaldi\\image_index\n" ] }, { "data": { "text/plain": [ "{0: 'image.png'}" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "RAG.index(\n", " input_path=\"image.png\",\n", " index_name=\"image_index\",\n", " store_collection_with_index=False,\n", " overwrite=True\n", ")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `` tokens in the very beginning of your text and `` token after that. For this call, we will infer how many images each text has and add special tokens.\n" ] }, { "data": { "text/plain": [ "[{'doc_id': 0, 'page_num': 1, 'score': 18.75, 'metadata': {}, 'base64': None}]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "text_query = \"What is the structure of the compiler?\"\n", "results = RAG.search(text_query, k=1)\n", "results" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "The argument `trust_remote_code` is to be used with Auto classes. 
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.\n",
      "Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}\n",
      "Loading checkpoint shards: 100%|██████████| 2/2 [00:13<00:00, 6.88s/it]\n"
     ]
    }
   ],
   "source": [
    "from transformers import Qwen2VLForConditionalGeneration, AutoProcessor\n",
    "from qwen_vl_utils import process_vision_info\n",
    "import torch\n",
    "\n",
    "# Load the Qwen2-VL generator in bfloat16 on GPU, float32 on CPU.\n",
    "# trust_remote_code has been dropped here: as the warning in this cell's\n",
    "# output notes, it only applies to Auto classes and is ignored otherwise.\n",
    "model = Qwen2VLForConditionalGeneration.from_pretrained(\n",
    "    \"Qwen/Qwen2-VL-2B-Instruct\",\n",
    "    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# page_num in the search result is 1-based; subtracting 1 gives the\n",
    "# 0-based page index (useful when a multi-page document is indexed).\n",
    "results[0][\"page_num\"] - 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from PIL import Image\n",
    "\n",
    "# Qwen2-VL's processor handles both tokenization and image preprocessing.\n",
    "processor = AutoProcessor.from_pretrained(\"Qwen/Qwen2-VL-2B-Instruct\", trust_remote_code=True)\n",
    "\n",
    "# Pair the retrieved page image with the original text query in a chat\n",
    "# message. A single image was indexed, so it is opened directly; for a\n",
    "# multi-page document, select the page via the 0-based index computed above.\n",
    "messages = [\n",
    "    {\n",
    "        \"role\": \"user\",\n",
    "        \"content\": [\n",
    "            {\n",
    "                \"type\": \"image\",\n",
    "                \"image\": Image.open(\"image.png\"),\n",
    "            },\n",
    "            {\"type\": \"text\", \"text\": text_query},\n",
    "        ],\n",
    "    }\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Render the chat messages with the model's template, appending the\n",
    "# generation prompt so the model knows to respond.\n",
    "text = processor.apply_chat_template(\n",
    "    messages, tokenize=False, add_generation_prompt=True\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Extract the image (and any video) inputs from the messages, then tokenize\n",
    "# text and images together into model-ready tensors.\n",
    "image_inputs, video_inputs = process_vision_info(messages)\n",
    "inputs = processor(\n",
    "    text=[text],\n",
    "    images=image_inputs,\n",
    "    videos=video_inputs,\n",
    "    padding=True,\n",
    "    return_tensors=\"pt\",\n",
    ")\n",
    "\n",
    "# Move both the inputs and the model to the GPU if one is available.\n",
    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "inputs = inputs.to(device)\n",
    "model = model.to(device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Generate an answer, then strip the prompt tokens from each sequence so\n",
    "# that only the newly generated tokens are decoded.\n",
    "generated_ids = model.generate(**inputs, max_new_tokens=50)\n",
    "generated_ids_trimmed = [\n",
    "    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n",
    "]\n",
    "output_text = processor.batch_decode(\n",
    "    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['The structure of the compiler, as described in the syllabus, includes the following components:\\n\\n1. **Lexical Analysis**: This involves the role of the lexical analyzer, input buffering, and the design of lexical analyzers, specification and recognition of tokens']\n"
     ]
    }
   ],
   "source": [
    "print(output_text)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}