|
from typing import ClassVar, List, Optional |
|
|
|
import torch |
|
from torch import nn |
|
from transformers.models.qwen2_vl import Qwen2VLConfig, Qwen2VLForConditionalGeneration |
|
|
|
|
|
class ColQwen2(Qwen2VLForConditionalGeneration):
    """
    ColQwen2 model implementation from the "ColPali: Efficient Document Retrieval with Vision Language Models" paper.

    Wraps a Qwen2-VL backbone and adds a linear head (``custom_text_proj``)
    that projects every token's last hidden state down to ``self.dim``
    dimensions. ``forward`` returns these per-token vectors L2-normalised
    along the last axis and, when an attention mask is supplied, zeroed at
    masked positions.
    """

    # NOTE(review): presumably consumed by the Hugging Face base class as the
    # name of the primary model input - confirm against the installed version.
    main_input_name: ClassVar[str] = "doc_input_ids"

    def __init__(self, config: Qwen2VLConfig):
        """Build the backbone from ``config`` and attach the projection head.

        Args:
            config: Configuration forwarded unchanged to the parent constructor.
        """
        super().__init__(config=config)
        # Output size of the per-token embedding produced by `forward`.
        self.dim = 128
        self.custom_text_proj = nn.Linear(self.model.config.hidden_size, self.dim)
        self.padding_side = "left"
        self.post_init()

    def inner_forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        pixel_values: Optional[torch.Tensor] = None,
        pixel_values_videos: Optional[torch.FloatTensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
        video_grid_thw: Optional[torch.LongTensor] = None,
    ) -> torch.Tensor:
        """Run the backbone and return its first output.

        When ``inputs_embeds`` is not given, token embeddings are looked up
        from ``input_ids`` and the positions holding the image / video
        placeholder ids (``config.image_token_id`` / ``config.video_token_id``)
        are overwritten with the vision tower's output for ``pixel_values`` /
        ``pixel_values_videos``. When ``inputs_embeds`` is given, it is passed
        to the backbone as is and none of that preparation runs.

        Args:
            input_ids: Token ids; required whenever ``inputs_embeds`` is None.
            attention_mask: Optional mask forwarded to the backbone.
            position_ids: Optional position ids forwarded to the backbone.
            past_key_values: Optional cache forwarded to the backbone.
            inputs_embeds: Optional precomputed input embeddings.
            use_cache: Forwarded to the backbone.
            output_attentions: Forwarded to the backbone.
            output_hidden_states: Forwarded to the backbone.
            return_dict: Forwarded to the backbone.
            pixel_values: Optional image input for the vision tower.
            pixel_values_videos: Optional video input for the vision tower.
            image_grid_thw: Grid descriptor handed to the vision tower
                together with ``pixel_values``.
            video_grid_thw: Grid descriptor handed to the vision tower
                together with ``pixel_values_videos``.

        Returns:
            ``outputs[0]`` of ``self.model`` - presumably the last hidden
            states of shape (batch, seq_len, hidden_size); TODO confirm.
        """
        if inputs_embeds is None:
            inputs_embeds = self.model.embed_tokens(input_ids)

            if pixel_values is not None:
                # Cast to the vision tower's dtype before encoding.
                pixel_values = pixel_values.type(self.visual.get_dtype())
                image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
                # Broadcast the placeholder mask over the embedding axis so
                # masked_scatter fills whole embedding rows.
                image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1).expand_as(inputs_embeds)
                image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
                inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)

            if pixel_values_videos is not None:
                pixel_values_videos = pixel_values_videos.type(self.visual.get_dtype())
                video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
                video_mask = (input_ids == self.config.video_token_id).unsqueeze(-1).expand_as(inputs_embeds)
                video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
                inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)

            if attention_mask is not None:
                attention_mask = attention_mask.to(inputs_embeds.device)

        # The backbone only ever receives embeddings; input_ids is deliberately None.
        outputs = self.model(
            input_ids=None,
            position_ids=position_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        return hidden_states

    def forward(self, *args, **kwargs) -> torch.Tensor:
        """Compute per-token, L2-normalised embeddings of size ``self.dim``.

        Positional and keyword arguments are forwarded to ``inner_forward``.
        ``input_ids`` must be passed by keyword because it is also needed to
        compute the rotary position ids. ``position_ids`` must not be passed:
        it is always computed here.

        Args:
            *args: Extra positional arguments for ``inner_forward``.
            **kwargs: Keyword arguments for ``inner_forward``. Recognised
                here: ``input_ids`` (required), ``attention_mask``,
                ``pixel_values`` and ``image_grid_thw`` (required whenever
                ``pixel_values`` is not None). ``output_hidden_states`` and
                ``use_cache`` are ignored because both are forced below.

        Returns:
            The projected hidden states, divided by their L2 norm along the
            last axis and, if ``attention_mask`` was given, multiplied by it
            so masked positions are zero. Presumably shaped
            (batch, seq_len, self.dim) - TODO confirm.

        Raises:
            KeyError: If ``input_ids`` is missing, or if ``pixel_values`` is
                given without ``image_grid_thw``.
        """
        # Both flags are set explicitly in the inner_forward call; dropping
        # caller-supplied values avoids a duplicate-keyword TypeError.
        kwargs.pop("output_hidden_states", None)
        kwargs.pop("use_cache", None)

        attention_mask = kwargs.get("attention_mask")
        pixel_values = kwargs.get("pixel_values")

        # An explicit `pixel_values=None` (text-only batch) must skip this step.
        if pixel_values is not None:
            image_grid_thw = kwargs["image_grid_thw"]
            # Rows to keep per image: product of grid columns 1 and 2.
            offsets = image_grid_thw[:, 1] * image_grid_thw[:, 2]
            # Presumably pixel_values arrives padded per image to a common
            # length (verify against the processor/collator); trim each entry
            # to its real length and flatten along dim 0 for the vision tower.
            kwargs["pixel_values"] = torch.cat(
                [pv[:o] for pv, o in zip(pixel_values, offsets)],
                dim=0,
            )

        # NOTE(review): video_grid_thw is hard-wired to None here, so any
        # video input does not influence the computed position ids - confirm
        # this is intended.
        position_ids, _ = self.get_rope_index(
            input_ids=kwargs["input_ids"],
            image_grid_thw=kwargs.get("image_grid_thw"),
            video_grid_thw=None,
            attention_mask=attention_mask,
        )

        last_hidden_states = self.inner_forward(
            *args,
            **kwargs,
            position_ids=position_ids,
            use_cache=False,
            output_hidden_states=True,
        )

        proj = self.custom_text_proj(last_hidden_states)

        # L2 normalisation along the embedding axis.
        proj = proj / proj.norm(dim=-1, keepdim=True)

        # Zero out masked positions; without a mask every position is kept
        # (previously a missing mask raised KeyError after the full forward pass).
        if attention_mask is not None:
            proj = proj * attention_mask.unsqueeze(-1)
        return proj
|
|