from transformers import AutoProcessor, CLIPModel
import torch


class CLIPImageEncoder:
    def __init__(self, device="cpu"):
        self.device = device
        # Load the CLIP ViT-B/32 checkpoint and move it to the requested device.
        self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
        self.processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

    def encode_image(self, image_pil):
        # Encode a single PIL image into a 1-D CLIP embedding (numpy array).
        inputs = self.processor(images=image_pil, return_tensors="pt").to(self.device)
        with torch.no_grad():
            image_features = self.model.get_image_features(**inputs)
        return image_features.cpu().numpy()[0]

    def encode_images(self, batch):
        # Batched variant: expects a dict with an "image" column and returns
        # a dict with the corresponding "clip_embeddings" column.
        images = batch["image"]
        inputs = self.processor(images=images, return_tensors="pt").to(self.device)
        with torch.no_grad():
            image_features = self.model.get_image_features(**inputs)
        return {"clip_embeddings": image_features.cpu().numpy()}
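
# A minimal usage sketch (not part of the class above): the image path and
# dataset call below are hypothetical and only illustrate how the two methods
# are meant to be invoked.
if __name__ == "__main__":
    from PIL import Image

    encoder = CLIPImageEncoder(device="cuda" if torch.cuda.is_available() else "cpu")

    # Single image -> one CLIP embedding (512-dim for ViT-B/32).
    image = Image.open("example.jpg")  # hypothetical image path
    embedding = encoder.encode_image(image)
    print(embedding.shape)  # (512,)

    # The batched method pairs naturally with Hugging Face datasets, e.g.:
    #   ds = ds.map(encoder.encode_images, batched=True, batch_size=64)
    # which would add a "clip_embeddings" column to the dataset.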