Spaces:

BenkHel
/

CumoThesis

Running

BenkHel committed 26 days ago

Commit

98866e7

verified ·

1 Parent(s): 0990d31

Update cumo/model/multimodal_encoder/clip_encoder.py

Files changed (1) hide show

cumo/model/multimodal_encoder/clip_encoder.py CHANGED Viewed

@@ -86,7 +86,8 @@ class CLIPVisionTower(nn.Module):
             for image in images:
                 dev = image.device if hasattr(image, "device") else torch.device("cuda" if torch.cuda.is_available() else "cpu")
                 dt = image.dtype if hasattr(image, "dtype") else torch.float16
-                image_forward_out = self.vision_model(image.to(device=dev, dtype=dt).unsqueeze(0))
                 image_feature = self.feature_select(image_forward_out).to(image.dtype)
                 image_features.append(image_feature)

             for image in images:
                 dev = image.device if hasattr(image, "device") else torch.device("cuda" if torch.cuda.is_available() else "cpu")
                 dt = image.dtype if hasattr(image, "dtype") else torch.float16
+                print("Image shape before vision_model:", image.shape)
+                image_forward_out = self.vision_model(image.to(device=dev, dtype=dt))
                 image_feature = self.feature_select(image_forward_out).to(image.dtype)
                 image_features.append(image_feature)