Update app.py
app.py CHANGED
@@ -6,10 +6,11 @@ import torchvision.transforms as transforms
 from functools import partial
 from dinov2 import DinoVisionTransformer
 
+
 # Define the model (using vit_small as an example)
-def create_model():
+def create_model(height, width):
     model = DinoVisionTransformer(
-        img_size=
+        img_size=(height, width),  # dynamically adjust the input size
         patch_size=16,
         in_chans=3,
         embed_dim=384,
@@ -22,14 +23,13 @@ def create_model():
     model.eval()
     return model
 
-#
-def
-
-
-
-
-
-    img_tensor = transform(image).unsqueeze(0)  # add a batch dimension
+# Generate a random input tensor
+def generate_input(batch_size, channels, height, width):
+    # Make sure the input size is a multiple of patch_size
+    patch_size = 16
+    height = (height // patch_size) * patch_size
+    width = (width // patch_size) * patch_size
+    img_tensor = torch.randn(batch_size, channels, height, width)
     return img_tensor
 
 # Modify forward_features to report intermediate shapes
@@ -41,7 +41,7 @@ def forward_with_shapes(model, x, masks=None):
     shapes.append(f"After patch_embed: {x.shape}")
 
     # 2. Prepare tokens with masks
-    B, nc, w, h = x.shape[0], 3,
+    B, nc, w, h = x.shape[0], 3, x.shape[-2], x.shape[-1]  # use the dynamic input size
     if masks is not None:
         x = torch.where(masks.unsqueeze(-1), model.mask_token.to(x.dtype).unsqueeze(0), x)
     x = torch.cat((model.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
@@ -88,24 +88,32 @@ def forward_with_shapes(model, x, masks=None):
     return output, shapes
 
 # Main processing function
-def
-
-    img_tensor =
+def process_tensor(batch_size, channels, height, width):
+    # Generate a random input
+    img_tensor = generate_input(batch_size, channels, height, width)
+
+    # Create the model with a dynamically adjusted img_size
+    model = create_model(height=img_tensor.shape[2], width=img_tensor.shape[3])
 
     # Forward pass and collect the shapes
     output, shapes = forward_with_shapes(model, img_tensor)
 
     # Turn the list of shapes into a string
-    shapes_text = "\n".join(shapes)
+    shapes_text = f"Input shape: {img_tensor.shape}\n" + "\n".join(shapes)
     return shapes_text
 
 # Gradio interface
 demo = gr.Interface(
-    fn=
-    inputs=
+    fn=process_tensor,
+    inputs=[
+        gr.Slider(minimum=1, maximum=8, step=1, value=1, label="Batch Size"),
+        gr.Slider(minimum=1, maximum=8, step=1, value=3, label="Channels"),
+        gr.Slider(minimum=16, maximum=512, step=16, value=224, label="Height"),
+        gr.Slider(minimum=16, maximum=512, step=16, value=224, label="Width"),
+    ],
     outputs=gr.Textbox(label="Feature Map Shapes"),
     title="DinoVisionTransformer Feature Map Shapes",
-    description="
+    description="Adjust the sliders to set the input tensor dimensions (B, C, H, W) and see the shapes of feature maps at each step of DinoVisionTransformer (vit_small, 4 register tokens). Height and Width will be adjusted to be multiples of 16.",
 )
 
-demo.launch()
+demo.launch()
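
For reference, the shape arithmetic behind the reported feature maps can be checked by hand. The sketch below is not part of app.py; it assumes only the values visible in the diff (patch_size=16, embed_dim=384, one CLS token) plus the 4 register tokens mentioned in the description, and exactly where the register tokens join the sequence depends on the DinoVisionTransformer implementation.

# Hypothetical helper: rough shape arithmetic for the default slider values.
def expected_shapes(batch_size=1, height=224, width=224,
                    patch_size=16, embed_dim=384, num_register_tokens=4):
    # generate_input() snaps H and W down to multiples of patch_size.
    height = (height // patch_size) * patch_size
    width = (width // patch_size) * patch_size
    num_patches = (height // patch_size) * (width // patch_size)
    # One CLS token plus the register tokens join the patch tokens.
    seq_len = num_patches + 1 + num_register_tokens
    return {
        "input": (batch_size, 3, height, width),
        "after_patch_embed": (batch_size, num_patches, embed_dim),
        "with_cls_and_registers": (batch_size, seq_len, embed_dim),
    }

print(expected_shapes())
# {'input': (1, 3, 224, 224), 'after_patch_embed': (1, 196, 384),
#  'with_cls_and_registers': (1, 201, 384)}

With the default sliders (1, 3, 224, 224) this predicts 14 x 14 = 196 patch tokens and a 201-token sequence of width 384, which is roughly what the textbox output should show.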
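
The middle of forward_with_shapes (lines 48-87) is elided from the hunks above. Assuming the usual DINOv2 layout of model.blocks followed by model.norm, the elided portion presumably loops over the transformer blocks along these lines; this is a sketch under that assumption, not the file's actual code.

# Hypothetical sketch of the elided block loop inside forward_with_shapes.
def run_blocks_with_shapes(model, x, shapes):
    for i, blk in enumerate(model.blocks):
        x = blk(x)
        shapes.append(f"After block {i}: {x.shape}")
    x = model.norm(x)
    shapes.append(f"After final norm: {x.shape}")
    return x, shapes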