import torch
from transformers import AutoProcessor, CLIPModel


class CLIPImageEncoder:
    def __init__(self, device="cpu"):
        self.device = device
        self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
        self.processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

    def encode_image(self, image_pil):
        # Encode a single PIL image into a CLIP embedding vector (numpy array).
        inputs = self.processor(images=image_pil, return_tensors="pt").to(self.device)
        with torch.no_grad():
            image_features = self.model.get_image_features(**inputs)
        return image_features.cpu().numpy()[0]

    def encode_images(self, batch):
        # Encode a batch of images (e.g. a Hugging Face Datasets batch with an
        # "image" column) and return the embeddings under "clip_embeddings",
        # so this method can be used with Dataset.map(batched=True).
        images = batch["image"]
        inputs = self.processor(images=images, return_tensors="pt").to(self.device)
        with torch.no_grad():
            image_features = self.model.get_image_features(**inputs)
        return {"clip_embeddings": image_features.cpu().numpy()}
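
# Minimal usage sketch (assumption: "example.jpg" is a hypothetical local image
# file; it is not part of the original code).
if __name__ == "__main__":
    from PIL import Image

    encoder = CLIPImageEncoder(device="cpu")
    embedding = encoder.encode_image(Image.open("example.jpg"))
    print(embedding.shape)  # (512,) for clip-vit-base-patch32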