Spaces:

sayedM
/

DINOv3-PCA-visualization

Running

App Files Files Community

sayedM committed 5 days ago

Commit

364c029

verified ·

1 Parent(s): e55e82c

Create app.py

Browse files

Files changed (1) hide show

app.py +194 -0

app.py ADDED Viewed

	@@ -0,0 +1,194 @@

+import torch
+import gradio as gr
+import numpy as np
+from PIL import Image
+import torchvision.transforms.functional as TF
+from matplotlib import colormaps
+from transformers import AutoModel
+# ----------------------------
+# Configuration
+# ----------------------------
+# Hugging Face Hub identifier of the checkpoint passed to AutoModel.from_pretrained.
+MODEL_ID = "facebook/dinov3-vith16plus-pretrain-lvd1689m"
+# Side length, in pixels, of one square patch. Used to snap image sizes to the
+# patch grid and to derive the patch-grid shape. Presumably matches the "16" in
+# MODEL_ID -- TODO confirm against the model config.
+PATCH_SIZE = 16
+# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# Per-channel (R, G, B) mean and standard deviation used to normalize the input tensor.
+IMAGENET_MEAN = (0.485, 0.456, 0.406)
+IMAGENET_STD = (0.229, 0.224, 0.225)
+# ----------------------------
+# Model Loading (runs once at startup)
+# ----------------------------
+def load_model_from_hub():
+    """Loads the DINOv3 model from the Hugging Face Hub."""
+    print(f"Loading model '{MODEL_ID}' from Hugging Face Hub...")
+    try:
+        model = AutoModel.from_pretrained(MODEL_ID)
+        model.to(DEVICE).eval()
+        print(f"✅ Model loaded successfully on device: {DEVICE}")
+        return model
+    except Exception as e:
+        print(f"❌ Failed to load model: {e}")
+        gr.Error(f"Could not load model from Hub: {e}")
+        return None
+# Load the model once at import time so every request reuses the same instance.
+# `model` is None when loading failed (load_model_from_hub returns None on error).
+model = load_model_from_hub()
+# ----------------------------
+# Helper Functions
+# ----------------------------
+def resize_to_grid(img: Image.Image, long_side: int, patch: int) -> torch.Tensor:
+    """Resizes an image to dimensions that are multiples of the patch size."""
+    w, h = img.size
+    scale = long_side / max(h, w)
+    new_h = max(patch, int(round(h * scale)))
+    new_w = max(patch, int(round(w * scale)))
+    new_h = ((new_h + patch - 1) // patch) * patch
+    new_w = ((new_w + patch - 1) // patch) * patch
+    return TF.to_tensor(TF.resize(img.convert("RGB"), (new_h, new_w)))
+def colorize(data: np.ndarray, cmap_name: str = 'viridis') -> Image.Image:
+    """Converts a 2D numpy array to a colored PIL image."""
+    x = data.astype(np.float32)
+    x = (x - x.min()) / (x.max() - x.min() + 1e-8)
+    cmap = colormaps.get_cmap(cmap_name)
+    rgb = (cmap(x)[..., :3] * 255).astype(np.uint8)
+    return Image.fromarray(rgb)
+def blend(base: Image.Image, heat: Image.Image, alpha: float) -> Image.Image:
+    """Blends a heatmap onto a base image."""
+    base = base.convert("RGBA")
+    heat = heat.convert("RGBA")
+    return Image.blend(base, heat, alpha=alpha)
+# ----------------------------
+# Core Gradio Function
+# ----------------------------
+@torch.inference_mode()
+def generate_pca_visuals(
+    image_pil: Image.Image,
+    resolution: int,
+    cmap_name: str,
+    overlay_alpha: float,
+    progress=gr.Progress(track_tqdm=True)
+):
+    """Main function to generate PCA visuals."""
+    if model is None:
+        raise gr.Error("DINOv3 model could not be loaded. Check the logs.")
+    if image_pil is None:
+        return None, None, "Please upload an image and click Generate.", None, None
+    # 1. Image Preprocessing
+    progress(0.2, desc="Resizing and preprocessing image...")
+    image_tensor = resize_to_grid(image_pil, resolution, PATCH_SIZE)
+    t_norm = TF.normalize(image_tensor, IMAGENET_MEAN, IMAGENET_STD).unsqueeze(0).to(DEVICE)
+    original_processed_image = TF.to_pil_image(image_tensor)
+    _, _, H, W = t_norm.shape
+    Hp, Wp = H // PATCH_SIZE, W // PATCH_SIZE
+    # 2. Feature Extraction
+    progress(0.5, desc="🦖 Extracting features with DINOv3...")
+    outputs = model(t_norm)
+    # The patch embeddings are in last_hidden_state, we skip the first token (CLS)
+    patch_embeddings = outputs.last_hidden_state.squeeze(0)[1:, :]
+    # 3. PCA Calculation
+    progress(0.8, desc="🔬 Performing PCA...")
+    X_centered = patch_embeddings.float() - patch_embeddings.float().mean(0, keepdim=True)
+    U, S, V = torch.pca_lowrank(X_centered, q=3, center=False)
+    # Stabilize the signs of the eigenvectors for deterministic output
+    for i in range(V.shape[1]):
+        max_abs_idx = torch.argmax(torch.abs(V[:, i]))
+        if V[max_abs_idx, i] < 0:
+            V[:, i] *= -1
+    scores = X_centered @ V[:, :3]
+    # 4. Explained Variance
+    total_variance = (X_centered ** 2).sum()
+    explained_variance = [float((s**2) / total_variance) for s in S]
+    variance_text = (
+        f"**📊 Explained Variance Ratios:**\n\n"
+        f"- **PC1:** {explained_variance[0]:.2%}\n"
+        f"- **PC2:** {explained_variance[1]:.2%}\n"
+        f"- **PC3:** {explained_variance[2]:.2%}"
+    )
+    # 5. Create Visualizations
+    pc1_map = scores[:, 0].reshape(Hp, Wp).cpu().numpy()
+    pc1_image_raw = colorize(pc1_map, cmap_name)
+    pc_rgb_map = scores.reshape(Hp, Wp, 3).cpu().numpy()
+    min_vals = pc_rgb_map.reshape(-1, 3).min(axis=0)
+    max_vals = pc_rgb_map.reshape(-1, 3).max(axis=0)
+    pc_rgb_map = (pc_rgb_map - min_vals) / (max_vals - min_vals + 1e-8)
+    pc_rgb_image_raw = Image.fromarray((pc_rgb_map * 255).astype(np.uint8))
+    target_size = original_processed_image.size
+    pc1_image_smooth = pc1_image_raw.resize(target_size, Image.Resampling.BICUBIC)
+    pc_rgb_image_smooth = pc_rgb_image_raw.resize(target_size, Image.Resampling.BICUBIC)
+    blended_image = blend(original_processed_image, pc1_image_smooth, overlay_alpha)
+    progress(1.0, desc="✅ Done!")
+    return pc1_image_smooth, pc_rgb_image_smooth, variance_text, blended_image, original_processed_image
+# ----------------------------
+# Gradio Interface
+# ----------------------------
+# Build the UI: input image and controls in the left column, result tabs in the right column.
+with gr.Blocks(theme=gr.themes.Soft(), title="DINOv3 PCA Explorer") as demo:
+    gr.Markdown(
+        """
+        # 🦖 DINOv3 PCA Explorer
+        Upload an image to visualize the principal components of its patch features.
+        This reveals the main axes of semantic variation within the image as understood by the model.
+        """
+    )
+    with gr.Row():
+        with gr.Column(scale=2):
+            # `value` is a URL string; presumably Gradio fetches it as the default image -- TODO confirm.
+            input_image = gr.Image(type="pil", label="Upload Image", value="https://picsum.photos/id/1011/800/600")
+            with gr.Accordion("⚙️ Visualization Controls", open=True):
+                # step=16 equals PATCH_SIZE, so every selectable resolution is a multiple of the patch size.
+                resolution_slider = gr.Slider(
+                    minimum=224, maximum=1024, value=512, step=16,
+                    label="Processing Resolution",
+                    info="Higher values capture more detail but are slower."
+                )
+                cmap_dropdown = gr.Dropdown(
+                    ['viridis', 'magma', 'inferno', 'plasma', 'cividis', 'jet'],
+                    value='viridis',
+                    label="Heatmap Colormap"
+                )
+                alpha_slider = gr.Slider(
+                    minimum=0, maximum=1, value=0.5,
+                    label="Overlay Opacity"
+                )
+            run_button = gr.Button("🚀 Generate PCA Visuals", variant="primary")
+        with gr.Column(scale=3):
+            with gr.Tabs():
+                with gr.TabItem("🖼️ Overlay"):
+                    gr.Markdown("Visualize the main heatmap blended with the original image.")
+                    output_blended = gr.Image(label="PC1 Heatmap Overlay")
+                    output_processed = gr.Image(label="Original Processed Image (at selected resolution)")
+                with gr.TabItem("📊 PCA Outputs"):
+                    gr.Markdown("View the raw outputs of the Principal Component Analysis.")
+                    output_pc1 = gr.Image(label="PC1 Heatmap (Smoothed)")
+                    output_rgb = gr.Image(label="Top 3 PCs as RGB (Smoothed)")
+                    output_variance = gr.Markdown(label="Explained Variance")
+    # `inputs` follows the parameter order of generate_pca_visuals; `outputs`
+    # follows the order of the 5-tuple it returns.
+    run_button.click(
+        fn=generate_pca_visuals,
+        inputs=[input_image, resolution_slider, cmap_dropdown, alpha_slider],
+        outputs=[output_pc1, output_rgb, output_variance, output_blended, output_processed]
+    )
+# Start the Gradio server only when the file is executed as a script.
+if __name__ == "__main__":
+    demo.launch()