from typing import Tuple, List, Optional, Union
import re
import math
from PIL import Image
import numpy as np
import torch
import torch.nn.functional as F
from qwen_vl_utils import process_vision_info
from transformers.feature_extraction_utils import BatchFeature
from transformers.image_utils import ImageInput, VideoInput
from transformers.processing_utils import (
    Unpack,
)
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
from transformers.models.qwen2_vl.image_processing_qwen2_vl import (
    smart_resize,
    Qwen2VLImageProcessor,
)
from transformers.models.qwen2_5_vl.processing_qwen2_5_vl import (
    Qwen2_5_VLProcessorKwargs,
    Qwen2_5_VLProcessor,
)
"""
Qwen2.5-VL does not use AnyRes to my relief.
Things to take into account:
- smart_resize
- temporal dimension
- grid_t = patches.shape[0] // self.temporal_patch_size
- grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
- merge_size (2)
Usage:
model_name = "Qwen/Qwen2.5-VL-7B-Instruct"
processor = Qwen2_5_VLPointerProcessor.from_pretrained(model_name)
processor.image_processor = Qwen2VLImagePointerProcessor.from_pretrained(model_name)
messages = [
{
"role": "user",
"content": [
{
"type": "image",
"image": "https://example---/demo.jpeg",
},
{"type": "text", "text": "Describe this image."},
],
},
{
'role': 'assistant',
'content': [
{
'type': 'text', 'text': '<think>Theres a cat at <|region|>, a dog at <|region|>.</think>A calico cat hanging out with a golden retriever.'
}
]
}
]
# Preparation for inference
text = processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
regions = [
[0, 10, 100, 200],
[300, 0, 600, 250]
]
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
text=[text],
images=image_inputs,
videos=video_inputs,
regions=[regions]
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
# Qwen2VLImageProcessor in a nutshell
'(tl tp) c (hlm hm hp) (wlm wm wp) -> (tl hlm wlm hm wm) (c tp hp wp)'
"""
BBOX = Tuple[int, int, int, int]

class PointerProcessor:
    @staticmethod
    def normalize_bbox(image_size: Tuple[int, int], bbox: BBOX):
        w, h = image_size
        bbox = [
            bbox[0] / w,
            bbox[1] / h,
            bbox[2] / w,
            bbox[3] / h,
        ]
        return "[{}]".format(", ".join([f"{v:.2f}" for v in bbox]))
    def get_masks(self, image_size: Tuple[int, int], indices: List[int]):
        width, height = image_size
        resized_height, resized_width = smart_resize(
            height,
            width,
            factor=self.patch_size * self.merge_size,
            min_pixels=self.min_pixels,
            max_pixels=self.max_pixels,
        )
        # grid_h = resized_height // self.patch_size // self.merge_size
        grid_w_m = resized_width // self.patch_size // self.merge_size
        mask = torch.zeros(resized_height, resized_width)
        for index in indices:
            index_h = index // grid_w_m
            index_w = index % grid_w_m
            bbox = (
                max(index_w * self.patch_size * self.merge_size, 0),
                max(index_h * self.patch_size * self.merge_size, 0),
                min((index_w + 1) * self.patch_size * self.merge_size, resized_width),
                min((index_h + 1) * self.patch_size * self.merge_size, resized_height),
            )
            x1, y1, x2, y2 = bbox
            mask[y1:y2, x1:x2] = 1
        # mask = mask.t()  # to width, height
        return mask, (resized_width, resized_height)
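
    # Note: get_masks is the inverse direction of get_patch_pointers below --
    # it maps merged-patch indices back to a binary mask at the smart-resized
    # resolution, which extract_masks uses to recover pixel-space regions from
    # generated <|copy_X|> tokens.
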
    def get_patch_pointers(
        self, image_size: Tuple[int, int], region: Union[BBOX, np.ndarray]
    ):
        if isinstance(region, np.ndarray):
            return self.get_mask_patch_pointers(image_size, region)
        else:
            return self.get_bbox_patch_pointers(image_size, region)
    def get_bbox_patch_pointers(self, image_size: Tuple[int, int], bbox: BBOX):
        factor = self.merge_size
        # factor = 1
        width, height = image_size
        resized_height, resized_width = smart_resize(
            height,
            width,
            factor=self.patch_size * self.merge_size,
            min_pixels=self.min_pixels,
            max_pixels=self.max_pixels,
        )
        x0, y0, x1, y1 = bbox
        resized_bbox = [
            max(x0 / width * resized_width, 0),
            max(y0 / height * resized_height, 0),
            min(x1 / width * resized_width, resized_width),
            min(y1 / height * resized_height, resized_height),
        ]
        # patch_bbox = [v / self.patch_size / self.merge_size for v in resized_bbox]
        patch_bbox = [v / self.patch_size / factor for v in resized_bbox]
        x0, y0, x1, y1 = patch_bbox
        boundaries = [
            math.floor(x0),
            math.floor(y0),
            math.ceil(x1),
            math.ceil(y1),
        ]
        x0, y0, x1, y1 = boundaries
        # t, h, w
        grid_w = resized_width // self.patch_size
        grid_w_m = grid_w // factor
        rows, cols = np.meshgrid(np.arange(y0, y1), np.arange(x0, x1), indexing="ij")
        grid_indices = np.column_stack((rows.ravel(), cols.ravel()))
        indices = grid_indices[:, 0] * grid_w_m + grid_indices[:, 1]
        base_ids = list(indices)
        # reorder
        # t, hl, wl, hm, wm
        # ids_map = torch.arange(grid_h * grid_w).reshape(grid_h, grid_w)
        # ids_map = rearrange(
        #     ids_map,
        #     "(hl hm) (wl wm) -> (hl wl) (hm wm)",
        #     hm=self.merge_size,
        #     wm=self.merge_size,
        # ).reshape(-1)
        # inv_map = ids_map.argsort()
        # ids = inv_map[base_ids].numpy()
        ids = np.array(base_ids)
        # ids.sort()
        return ids
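
    # Worked example (illustrative numbers, assuming patch_size=14 and
    # merge_size=2): an 800x500 image smart-resizes to 504x812 (H x W), i.e.
    # an 18x29 (rows x cols) merged grid. The bbox (0, 10, 100, 200) then
    # spans merged columns 0..3 and rows 0..7, so the returned ids are
    # row * 29 + col for those cells:
    #
    #     proc = Qwen2VLImagePointerProcessor.from_pretrained(
    #         "Qwen/Qwen2.5-VL-7B-Instruct"
    #     )
    #     ids = proc.get_bbox_patch_pointers((800, 500), (0, 10, 100, 200))
    #     # ids[:5] -> array([ 0,  1,  2,  3, 29])
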
    def get_mask_patch_pointers(self, image_size: Tuple[int, int], mask: np.ndarray):
        # mask size: (height, width), matching the (grid_h, grid_w) layout below
        width, height = image_size
        resized_height, resized_width = smart_resize(
            height,
            width,
            factor=self.patch_size * self.merge_size,
            min_pixels=self.min_pixels,
            max_pixels=self.max_pixels,
        )
        grid_w_m = resized_width // self.patch_size // self.merge_size
        grid_h_m = resized_height // self.patch_size // self.merge_size
        m = torch.from_numpy(mask).float()
        # `antialias` is a boolean flag in F.interpolate
        m = F.interpolate(
            m[None, None], (grid_h_m, grid_w_m), mode="bilinear", antialias=True
        )[0, 0]
        # m = m > 0  # upper bound
        grid_indices = m.nonzero(as_tuple=False)
        indices = grid_indices[:, 0] * grid_w_m + grid_indices[:, 1]
        ids = indices.numpy()
        return ids
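
    # With bilinear downsampling, any merged cell overlapping the mask gets a
    # nonzero value, so nonzero() selects an "upper bound" set of patches (cf.
    # the commented threshold above).
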
    def renormalize(self, tensor):
        # crude, approximate implementation for the lazy: averages the
        # per-channel mean/std instead of undoing normalization per channel
        mean = np.array(self.image_mean).mean()
        std = np.array(self.image_std).mean()
        return tensor * std + mean

class Qwen2VLImagePointerProcessor(Qwen2VLImageProcessor, PointerProcessor):
    pass


class Qwen2_5_VLPointerProcessor(Qwen2_5_VLProcessor):
    image_processor_class = "Qwen2VLImagePointerProcessor"

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
        chat_template=None,
        prepend_raw_region_to_text: bool = True,
        **kwargs,
    ):
        super().__init__(
            image_processor=image_processor,
            tokenizer=tokenizer,
            chat_template=chat_template,
            **kwargs,
        )
        self.region_token = "<|region|>"
        self.copy_token_start = None
        self.prepend_raw_region_to_text = prepend_raw_region_to_text
    def extract_masks(self, image_size: Tuple[int, int], text: str):
        # First, gather region indices from the text
        region_pattern = re.compile(r"<region>(.*?)</region>")
        regions = region_pattern.findall(text)
        indices = []
        copy_pattern = re.compile(r"<\|copy_(\d+)\|>")
        for region in regions:
            # Extract all numbers inside <|copy_X|> tags within the region
            numbers = [int(match) for match in copy_pattern.findall(region)]
            indices.append(numbers)
        # Then, convert region indices into masks
        masks = []
        resized_image_size = image_size
        for region in indices:
            mask, resized_image_size = self.image_processor.get_masks(
                image_size, region
            )
            masks.append(mask)
        return masks, resized_image_size
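
    # Round-trip sketch (variable names are illustrative): after decoding a
    # generated sequence containing <region><|copy_0|><|copy_1|>...</region>
    # spans, the pointers can be turned back into masks at the smart-resized
    # resolution:
    #
    #     decoded = processor.tokenizer.decode(output_ids)
    #     masks, (rw, rh) = processor.extract_masks(image.size, decoded)
    #     # each mask is an (rh, rw) tensor with 1s over the pointed patches
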
    def __call__(
        self,
        images: ImageInput = None,
        text: Union[
            TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]
        ] = None,
        videos: VideoInput = None,
        regions: Optional[List[Union[BBOX, np.ndarray]]] = None,
        **kwargs: Unpack[Qwen2_5_VLProcessorKwargs],
    ) -> BatchFeature:
"""
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwrags` arguments to
Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.
Args:
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. Both channels-first and channels-last formats are supported.
text (`str`, `List[str]`, `List[List[str]]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
regions:
either bboxes: List[Tuple[int, int, int, int]]
or masks: List[np.ndarray[width, height]]
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors of a particular framework. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
`return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
`None`).
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
- **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
- **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
- **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
- **second_per_grid_ts** -- List of video seconds per time grid. Returned when `videos` is not `None`.
"""
        output_kwargs = self._merge_kwargs(
            Qwen2_5_VLProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )
        obj_ptrs = None
        if images is not None:
            image_inputs = self.image_processor(
                images=images, videos=None, **output_kwargs["images_kwargs"]
            )
            image_grid_thw = image_inputs["image_grid_thw"]
            for image in images:
                assert isinstance(
                    image, Image.Image
                ), "only supporting a single image per row for now"
            if regions is not None:
                obj_ptrs = [
                    [
                        (
                            self.image_processor.get_patch_pointers(image.size, region)
                            if region is not None
                            else np.array([])
                        )
                        for region in image_region
                    ]
                    for image, image_region in zip(images, regions)
                ]
        else:
            image_inputs = {}
            image_grid_thw = None

        assert videos is None, "video inputs are not supported yet"  # TODO
        if videos is not None:
            videos_inputs = self.image_processor(
                images=None, videos=videos, **output_kwargs["images_kwargs"]
            )
            video_grid_thw = videos_inputs["video_grid_thw"]
            fps = output_kwargs["videos_kwargs"].pop("fps", 2.0)
            if isinstance(fps, (int, float)):
                second_per_grid_ts = [
                    self.image_processor.temporal_patch_size / fps
                ] * len(video_grid_thw)
            elif hasattr(fps, "__len__") and len(fps) == len(video_grid_thw):
                second_per_grid_ts = [
                    self.image_processor.temporal_patch_size / tmp for tmp in fps
                ]
            else:
                raise ValueError(
                    f"The length of fps ({len(fps) if hasattr(fps, '__len__') else fps}) must be equal to the length of video_grid_thw ({len(video_grid_thw)}) or fps should be a single number."
                )
            videos_inputs.update({"second_per_grid_ts": second_per_grid_ts})
        else:
            videos_inputs = {}
            video_grid_thw = None
        if not isinstance(text, list):
            text = [text]

        if image_grid_thw is not None:
            merge_length = self.image_processor.merge_size**2
            index = 0
            for i in range(len(text)):
                while self.image_token in text[i]:
                    text[i] = text[i].replace(
                        self.image_token,
                        "<|placeholder|>"
                        * (image_grid_thw[index].prod() // merge_length),
                        1,
                    )
                    index += 1
                text[i] = text[i].replace("<|placeholder|>", self.image_token)
        if obj_ptrs is not None:
            assert regions is not None
            for i in range(len(text)):
                ptrs = obj_ptrs[i]
                region = regions[i]
                assert len(ptrs) == text[i].count(self.region_token)
                index = 0
                while self.region_token in text[i]:
                    ptrs_str = "".join([f"<|copy_{j}|>" for j in ptrs[index]])
                    # use the image belonging to this row (one image per row)
                    region_str = self.image_processor.normalize_bbox(
                        images[i].size, region[index]
                    )
                    out_str = "<region>" + ptrs_str + "</region>"
                    if self.prepend_raw_region_to_text:
                        out_str = "<region>" + region_str + ptrs_str + "</region>"
                    text[i] = text[i].replace(
                        self.region_token,
                        out_str,
                        1,
                    )
                    index += 1
                # text[i] = text[i].replace("<|placeholder|>", self.region_token)
        if video_grid_thw is not None:
            # TODO: support video inputs
            merge_length = self.image_processor.merge_size**2
            index = 0
            for i in range(len(text)):
                while self.video_token in text[i]:
                    text[i] = text[i].replace(
                        self.video_token,
                        "<patch>"
                        + "<|placeholder|>"
                        * (video_grid_thw[index].prod() // merge_length)
                        + "</patch>",
                        1,
                    )
                    index += 1
                text[i] = text[i].replace("<|placeholder|>", self.video_token)

        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
        return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs})

def get_processor(model_name: str, **kwargs):
    processor = Qwen2_5_VLPointerProcessor.from_pretrained(model_name, **kwargs)
    processor.image_processor = Qwen2VLImagePointerProcessor.from_pretrained(
        model_name, **kwargs
    )
    # max_position_tokens = processor.tokenizer.model_max_length
    # new_tokens = [f"<|copy_{i}|>" for i in range(max_position_tokens)]  # too slow
    processor.tokenizer.orig_vocab_size = len(processor.tokenizer)
    new_tokens = [f"<|copy_{i}|>" for i in range(30000)]
    processor.tokenizer.add_tokens(new_tokens)
    processor.copy_token_start = processor.tokenizer.convert_tokens_to_ids("<|copy_0|>")
    return processor
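
# Note: add_tokens grows the tokenizer vocabulary, so any model trained or run
# with these pointer tokens needs its embedding matrix resized to match. A
# minimal sketch (the model class here is an assumption, not fixed by this file):
#
#     from transformers import Qwen2_5_VLForConditionalGeneration
#     model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_name)
#     model.resize_token_embeddings(len(processor.tokenizer))
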

# Create a data collator to encode text and image pairs
def collate_fn(examples, processor):
    # Get the texts and images, and apply the chat template
    examples, masks = zip(*examples)
    texts = [
        processor.apply_chat_template(example, tokenize=False) for example in examples
    ]  # Prepare texts for processing
    image_inputs = [
        process_vision_info(example)[0][0] for example in examples
    ]  # Process the images to extract inputs

    # Tokenize the texts and process the images
    batch = processor(
        text=texts,
        images=image_inputs,
        videos=None,
        regions=masks,
        padding=True,
        return_tensors="pt",
    )  # Encode texts and images into tensors

    # The labels are the input_ids; we mask the padding tokens in the loss computation
    labels = batch["input_ids"].clone()  # Clone input IDs for labels
    labels[labels == processor.tokenizer.pad_token_id] = -100  # Mask padding tokens

    # Ignore the image token indices in the loss computation (model specific)
    if isinstance(
        processor, Qwen2_5_VLPointerProcessor
    ):  # The pointer processor wraps a Qwen tokenizer, so the IDs are fixed
        image_tokens = [
            151652,
            151653,
            151655,
        ]  # Vision token IDs for the Qwen2/2.5-VL tokenizer
    else:
        image_tokens = [
            processor.tokenizer.convert_tokens_to_ids(processor.image_token)
        ]  # Convert image token to ID

    # Mask image token IDs in the labels
    for image_token_id in image_tokens:
        labels[labels == image_token_id] = -100
    batch["labels"] = labels  # Add labels to the batch
    return batch  # Return the prepared batch
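
# Usage sketch for the collator (the dataset layout is an assumption: each item
# must be an (example_messages, masks) pair, matching the zip(*examples) above):
#
#     from functools import partial
#     from torch.utils.data import DataLoader
#
#     loader = DataLoader(
#         dataset,
#         batch_size=2,
#         collate_fn=partial(collate_fn, processor=processor),
#     )
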

if __name__ == "__main__":
    # processor = Qwen2VLImagePointerProcessor.from_pretrained(
    #     "Qwen/Qwen2.5-VL-7B-Instruct"
    # )
    # image_size = [1036, 756]
    # regions = [[0, 20, 25, 120], [512, 600, 800, 800], [0, 0, 1023, 740]]
    # processor.test(image_size, regions)

    model_name = "Qwen/Qwen2.5-VL-7B-Instruct"
    processor = get_processor(model_name)

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": "https://example---/demo.jpeg",
                },
                {"type": "text", "text": "Describe this image."},
            ],
        },
        {
            "role": "assistant",
            "content": [
                {
                    "type": "text",
                    "text": "<think>There's a cat at <|region|>, a dog at <|region|>.</think>A calico cat hanging out with a golden retriever.",
                }
            ],
        },
    ]
    image = Image.new("RGB", (800, 500), "black")
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    bboxes = [[0, 10, 100, 200], [300, 0, 600, 250]]
    inputs = processor(
        text=[text],
        images=[image],
        videos=None,
        regions=[bboxes],
        padding=True,
        return_tensors="pt",
    )
    text = processor.tokenizer.decode(inputs.input_ids[0])
    print(text)
    masks, image_size = processor.extract_masks(image.size, text)
    import ipdb; ipdb.set_trace()  # noqa # fmt: skip