Upload processor
Browse files
- preprocessor_config.json +31 -0
- processor.py +91 -0
preprocessor_config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"auto_map": {
|
3 |
+
"AutoImageProcessor": "processor.CondViTProcessor",
|
4 |
+
"AutoProcessor": "processor.CondViTProcessor"
|
5 |
+
},
|
6 |
+
"bkg_color": 255,
|
7 |
+
"categories": [
|
8 |
+
"Bags",
|
9 |
+
"Feet",
|
10 |
+
"Hands",
|
11 |
+
"Head",
|
12 |
+
"Lower Body",
|
13 |
+
"Neck",
|
14 |
+
"Outwear",
|
15 |
+
"Upper Body",
|
16 |
+
"Waist",
|
17 |
+
"Whole Body"
|
18 |
+
],
|
19 |
+
"image_mean": [
|
20 |
+
0.48145466,
|
21 |
+
0.4578275,
|
22 |
+
0.40821073
|
23 |
+
],
|
24 |
+
"image_processor_type": "CondViTProcessor",
|
25 |
+
"image_std": [
|
26 |
+
0.26862954,
|
27 |
+
0.26130258,
|
28 |
+
0.27577711
|
29 |
+
],
|
30 |
+
"input_resolution": 224
|
31 |
+
}
|
processor.py
ADDED
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers.image_processing_utils import ImageProcessingMixin, BatchFeature
|
2 |
+
|
3 |
+
from torchvision.transforms import transforms as tf
|
4 |
+
import torchvision.transforms.functional as F
|
5 |
+
from PIL import Image
|
6 |
+
import torch
|
7 |
+
|
8 |
+
|
9 |
+
class CondViTProcessor(ImageProcessingMixin):
    """Image (and optional category) processor for CondViT.

    Pads each image to a square with a constant background color, resizes it
    to ``input_resolution``, converts it to a tensor and normalizes it with
    CLIP-style mean/std. Optionally maps garment category names to integer
    label indices via ``categories``.
    """

    # Default garment vocabulary; list order defines the integer label ids.
    _DEFAULT_CATEGORIES = (
        "Bags",
        "Feet",
        "Hands",
        "Head",
        "Lower Body",
        "Neck",
        "Outwear",
        "Upper Body",
        "Waist",
        "Whole Body",
    )

    def __init__(
        self,
        bkg_color=255,
        input_resolution=224,
        image_mean=(0.48145466, 0.4578275, 0.40821073),
        image_std=(0.26862954, 0.26130258, 0.27577711),
        categories=None,
        **kwargs,
    ):
        """
        Parameters
        ----------
        bkg_color : int
            Constant fill value used when padding images to a square.
        input_resolution : int
            Target resolution passed to the resize step.
        image_mean : Sequence[float]
            Per-channel normalization mean (CLIP defaults).
        image_std : Sequence[float]
            Per-channel normalization std (CLIP defaults).
        categories : Optional[Sequence[str]]
            Ordered category names; defaults to the 10 garment categories.
        """
        super().__init__(**kwargs)

        self.bkg_color = bkg_color
        self.input_resolution = input_resolution
        self.image_mean = image_mean
        self.image_std = image_std

        # BUGFIX: the original signature used a mutable list literal as the
        # default for `categories`, which is shared across every call/instance.
        # A None sentinel plus a tuple constant avoids that pitfall; copying
        # into a fresh list preserves the original list-typed attribute.
        if categories is None:
            categories = self._DEFAULT_CATEGORIES
        self.categories = list(categories)

    def square_pad(self, image):
        """Pad a PIL image to a square of side max(w, h), centered,
        filling the border with ``bkg_color``."""
        max_wh = max(image.size)
        p_left, p_top = [(max_wh - s) // 2 for s in image.size]
        p_right, p_bottom = [
            max_wh - (s + pad) for s, pad in zip(image.size, [p_left, p_top])
        ]
        padding = (p_left, p_top, p_right, p_bottom)
        return F.pad(image, padding, self.bkg_color, "constant")

    def process_img(self, image):
        """Square-pad, resize, tensorize and normalize a single PIL image."""
        img = self.square_pad(image)
        img = F.resize(img, self.input_resolution)
        img = F.to_tensor(img)
        img = F.normalize(img, self.image_mean, self.image_std)
        return img

    def process_cat(self, cat):
        """Map a category name to a scalar int64 tensor index.

        Returns None unchanged. Raises ValueError (via list.index) for
        unknown category names.
        """
        if cat is not None:
            # Explicit torch.long instead of the builtin `int` dtype alias;
            # both map to int64, this is just clearer.
            cat = torch.tensor(self.categories.index(cat), dtype=torch.long)
        return cat

    def __call__(self, images, categories=None):
        """
        Parameters
        ----------
        images : Union[Image.Image, List[Image.Image]]
            Image or list of images to process
        categories : Optional[Union[str, List[str]]]
            Category or list of categories to process

        Returns
        -------
        BatchFeature
            pixel_values : torch.Tensor
                Processed image tensor (B C H W)
            category : torch.Tensor
                Categories indices (B)

        Raises
        ------
        ValueError
            If `categories` is given but its length differs from `images`.
        """
        use_cats = categories is not None

        # Single Image + Single category → promote to batch of one.
        if isinstance(images, Image.Image):
            images = [images]
            if use_cats:
                categories = [categories]

        # BUGFIX: a silent length mismatch previously produced an
        # inconsistent batch; fail loudly instead.
        if use_cats and len(categories) != len(images):
            raise ValueError(
                f"Got {len(images)} images but {len(categories)} categories."
            )

        data = {}
        data["pixel_values"] = torch.stack([self.process_img(img) for img in images])

        if use_cats:
            data["category"] = torch.stack([self.process_cat(c) for c in categories])

        return BatchFeature(data=data)