from transformers import AutoProcessor, CLIPModel
import torch


class CLIPImageEncoder:
    """
    A class for encoding images using the CLIP model.

    Args:
        device (str): The device to run the model on (default: "cpu").

    Attributes:
        device (str): The device to run the model on.
        model (CLIPModel): The CLIP model used for image encoding.
        processor (AutoProcessor): The tokenizer and input processor for the CLIP model.
    """

    def __init__(self, device="cpu"):
        self.device = device
        # Load the pretrained CLIP ViT-B/32 checkpoint and move it to the target device.
        self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(self.device)
        self.processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

    def encode_image(self, image_pil):
        """
        Encodes a single image using the CLIP model.

        Args:
            image_pil: A PIL Image object representing the image to encode.

        Returns:
            numpy.ndarray: The CLIP embedding for the image.
        """
        with torch.no_grad():
            # Preprocess the image and move the resulting tensors to the model's device.
            inputs = self.processor(images=image_pil, return_tensors="pt").to(self.device)
            image_features = self.model.get_image_features(**inputs)
        return image_features.cpu().numpy()[0]

    def encode_images(self, batch):
        """
        Encodes a batch of images using the CLIP model.

        Args:
            batch (Dict[str, Any]): A dictionary containing the batch of images to encode
                under the "image" key.

        Returns:
            Dict[str, Any]: A dictionary containing the CLIP embeddings for the batch of
                images under the "clip_embeddings" key.
        """
        images = batch["image"]
        inputs = self.processor(images=images, return_tensors="pt").to(self.device)
        with torch.no_grad():
            image_features = self.model.get_image_features(**inputs)
        return {"clip_embeddings": image_features.cpu().numpy()}
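

# A minimal usage sketch for the class above. Assumptions: an image file named
# "example.jpg" exists (hypothetical path), and the ViT-B/32 checkpoint's image
# projection is 512-dimensional. The batch dict format mirrors batched-mapping
# APIs such as Ray Data's map_batches or Hugging Face Datasets' map(batched=True).
if __name__ == "__main__":
    from PIL import Image

    # Prefer a GPU when one is available; otherwise fall back to CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    encoder = CLIPImageEncoder(device=device)

    # Single image: returns a 1-D NumPy embedding.
    image = Image.open("example.jpg")
    embedding = encoder.encode_image(image)
    print(embedding.shape)  # (512,)

    # Batch of images: returns a 2-D array of embeddings, one row per image.
    batch = {"image": [image, image]}
    result = encoder.encode_images(batch)
    print(result["clip_embeddings"].shape)  # (2, 512)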