Samuel Stevens committed
Commit 699b9c3 · 1 Parent(s): 0ab58fa

bug: SAE examples are not highlighted

Files changed (2)
  1. app.py +110 -157
  2. modeling.py +53 -0
app.py CHANGED
@@ -2,7 +2,7 @@ import functools
  import io
  import json
  import logging
- import os.path
  import pathlib
  import typing

@@ -10,17 +10,19 @@ import beartype
  import einops
  import einops.layers.torch
  import gradio as gr
  import saev.activations
  import saev.config
  import saev.nn
  import saev.visuals
  import torch
- from jaxtyping import Float, Int, UInt8, jaxtyped
- from PIL import Image
  from torch import Tensor

  import constants
  import data

  logger = logging.getLogger("app.py")

@@ -29,33 +31,26 @@ logger = logging.getLogger("app.py")
  ####################


- DEBUG = False
- """Whether we are debugging."""
-
- max_frequency = 1e-2
  """Maximum frequency. Any feature that fires more than this is ignored."""

- n_sae_latents = 3
- """Number of SAE latents to show."""
-
- n_sae_examples = 4
- """Number of SAE examples per latent to show."""
-
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
- """Hardware accelerator, if any."""
-
  RESIZE_SIZE = 512
  """Resize shorter size to this size in pixels."""

  CROP_SIZE = (448, 448)
  """Crop size in pixels."""

- DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  """Hardware accelerator, if any."""

  CWD = pathlib.Path(".")
  """Current working directory."""

  ##########
  # Models #
@@ -63,27 +58,7 @@ CWD = pathlib.Path(".")


  @functools.cache
- def load_vit() -> tuple[saev.activations.WrappedVisionTransformer, typing.Callable]:
-     vit = (
-         saev.activations.WrappedVisionTransformer(
-             saev.config.Activations(
-                 model_family="dinov2",
-                 model_ckpt="dinov2_vitb14_reg",
-                 layers=[-2],
-                 n_patches_per_img=256,
-             )
-         )
-         .to(DEVICE)
-         .eval()
-     )
-     vit_transform = saev.activations.make_img_transform("dinov2", "dinov2_vitb14_reg")
-     logger.info("Loaded ViT.")
-
-     return vit, vit_transform
-
-
- @functools.cache
- def load_sae() -> saev.nn.SparseAutoencoder:
      """
      Loads a sparse autoencoder from disk.
      """
@@ -102,37 +77,12 @@ def load_clf() -> torch.nn.Module:
      buffer = io.BytesIO(fd.read())

      model = torch.nn.Linear(**kwargs)
-     state_dict = torch.load(buffer, weights_only=True, map_location=device)
      model.load_state_dict(state_dict)
-     model = model.to(device).eval()
      return model


- class RestOfDinoV2(torch.nn.Module):
-     def __init__(self, *, n_end_layers: int):
-         super().__init__()
-         self.vit = torch.hub.load("facebookresearch/dinov2", "dinov2_vitb14_reg")
-         self.n_end_layers = n_end_layers
-
-     def forward_start(self, x: Float[Tensor, "batch channels width height"]):
-         x_BPD = self.vit.prepare_tokens_with_masks(x)
-         for blk in self.vit.blocks[: -self.n_end_layers]:
-             x_BPD = blk(x_BPD)
-
-         return x_BPD
-
-     def forward_end(self, x_BPD: Float[Tensor, "batch n_patches dim"]):
-         for blk in self.vit.blocks[-self.n_end_layers :]:
-             x_BPD = blk(x_BPD)
-
-         x_BPD = self.vit.norm(x_BPD)
-         return x_BPD[:, self.vit.num_register_tokens + 1 :]
-
-
- rest_of_vit = RestOfDinoV2(n_end_layers=1)
- rest_of_vit = rest_of_vit.to(device)
-
-
  ####################
  # Global Variables #
  ####################
@@ -143,13 +93,23 @@ def load_tensor(path: str | pathlib.Path) -> Tensor:
      return torch.load(path, weights_only=True, map_location="cpu")


- # top_img_i = load_tensor(CWD / "assets" / "top_img_i.pt")
- # top_values = load_tensor(CWD / "assets" / "top_values_uint8.pt")
- # sparsity = load_tensor(CWD / "assets" / "sparsity.pt")


- # mask = torch.ones((sae.cfg.d_sae), dtype=bool)
- # mask = mask & (sparsity < max_frequency)


  ############
@@ -157,37 +117,42 @@ def load_tensor(path: str | pathlib.Path) -> Tensor:
  ############


- # in1k_dataset = saev.activations.get_dataset(
- #     saev.config.ImagenetDataset(),
- #     img_transform=v2.Compose([
- #         v2.Resize(size=(512, 512)),
- #         v2.CenterCrop(size=(448, 448)),
- #     ]),
- # )
-
-
- # acts_dataset = saev.activations.Dataset(
- #     saev.config.DataLoad(
- #         shard_root="/local/scratch/stevens.994/cache/saev/a1f842330bb568b2fb05c15d4fa4252fb7f5204837335000d9fd420f120cd03e",
- #         scale_mean=not DEBUG,
- #         scale_norm=not DEBUG,
- #         layer=-2,
- #     )
- # )
-
-
- # vit_dataset = saev.activations.Ade20k(
- #     saev.config.Ade20kDataset(
- #         root="/research/nfs_su_809/workspace/stevens.994/datasets/ade20k/"
- #     ),
- #     img_transform=v2.Compose([
- #         v2.Resize(size=(256, 256)),
- #         v2.CenterCrop(size=(224, 224)),
- #         v2.ToImage(),
- #         v2.ToDtype(torch.float32, scale=True),
- #         v2.Normalize(mean=[0.4850, 0.4560, 0.4060], std=[0.2290, 0.2240, 0.2250]),
- #     ]),
- # )


  #######################
@@ -202,12 +167,14 @@ class Example(typing.TypedDict):
      Used to store examples of SAE latent activations for visualization.
      """

      orig_url: str
      """The URL or path to access the original example image."""
      highlighted_url: str
      """The URL or path to access the SAE-highlighted image."""
-     index: int
-     """Dataset index."""


  @beartype.beartype
@@ -249,64 +216,73 @@ def get_sae_activations(image_i: int, patches: list[int]) -> list[SaeActivation]
      if not patches:
          return []

-     vit, vit_transform = load_vit()
-     sae = load_sae()

      img = data.get_image(image_i)

-     x = vit_transform(img)[None, ...].to(DEVICE)

-     _, vit_acts_BLPD = vit(x)
-     vit_acts_PD = (
-         vit_acts_BLPD[0, 0, 1:].to(DEVICE).clamp(-1e-5, 1e5)
-         - (constants.DINOV2_IMAGENET1K_MEAN).to(DEVICE)
      ) / constants.DINOV2_IMAGENET1K_SCALAR

-     _, f_x_PS, _ = sae(vit_acts_PD)
-     # Ignore [CLS] token and get just the requested latents.
-     acts_SP = einops.rearrange(f_x_PS, "patches n_latents -> n_latents patches")
-     logger.info("Got SAE activations.")
-
-     top_img_i, top_values = load_tensors(model_cfg)
-     logger.info("Loaded top SAE activations for '%s'.", model_name)

-     vit_acts_MD = torch.stack([
-         acts_dataset[image_i * acts_dataset.metadata.n_patches_per_img + i]["act"]
-         for i in patches
-     ]).to(device)

-     _, f_x_MS, _ = sae(vit_acts_MD)
-     f_x_S = f_x_MS.sum(axis=0)

      latents = torch.argsort(f_x_S, descending=True).cpu()
-     latents = latents[mask[latents]][:n_sae_latents].tolist()

-     images = []
      for latent in latents:
-         elems, seen_i_im = [], set()
          for i_im, values_p in zip(top_img_i[latent].tolist(), top_values[latent]):
              if i_im in seen_i_im:
                  continue

-             example = in1k_dataset[i_im]
-             elems.append(
-                 saev.visuals.GridElement(example["image"], example["label"], values_p)
-             )
              seen_i_im.add(i_im)

          # How to scale values.
          upper = None
          if top_values[latent].numel() > 0:
              upper = top_values[latent].max().item()

-         latent_images = [make_img(elem, upper=upper) for elem in elems[:n_sae_examples]]

-         while len(latent_images) < n_sae_examples:
-             latent_images += [None]

-         images.extend(latent_images)

-     return images + latents


  @torch.inference_mode
@@ -416,29 +392,6 @@ def upsample(
      )


- @beartype.beartype
- def make_img(
-     elem: saev.visuals.GridElement, *, upper: float | None = None
- ) -> Image.Image:
-     # Resize to 256x256 and crop to 224x224
-     resize_size_px = (512, 512)
-     resize_w_px, resize_h_px = resize_size_px
-     crop_size_px = (448, 448)
-     crop_w_px, crop_h_px = crop_size_px
-     crop_coords_px = (
-         (resize_w_px - crop_w_px) // 2,
-         (resize_h_px - crop_h_px) // 2,
-         (resize_w_px + crop_w_px) // 2,
-         (resize_h_px + crop_h_px) // 2,
-     )
-
-     img = elem.img.resize(resize_size_px).crop(crop_coords_px)
-     img = saev.imaging.add_highlights(
-         img, elem.patches.numpy(), upper=upper, opacity=0.5
-     )
-     return img
-
-
  with gr.Blocks() as demo:
      image_number = gr.Number(label="Validation Example")

 
app.py (new side of the diff)

@@ -2,7 +2,7 @@ import functools
  import io
  import json
  import logging
+ import math
  import pathlib
  import typing

@@ -10,17 +10,19 @@ import beartype
  import einops
  import einops.layers.torch
  import gradio as gr
+ import numpy as np
  import saev.activations
  import saev.config
  import saev.nn
  import saev.visuals
  import torch
+ from jaxtyping import Bool, Float, Int, UInt8, jaxtyped
+ from PIL import Image, ImageDraw
  from torch import Tensor

  import constants
  import data
+ import modeling

  logger = logging.getLogger("app.py")

@@ -29,33 +31,26 @@ logger = logging.getLogger("app.py")
  ####################


+ MAX_FREQ = 1e-2
  """Maximum frequency. Any feature that fires more than this is ignored."""

  RESIZE_SIZE = 512
  """Resize shorter size to this size in pixels."""

  CROP_SIZE = (448, 448)
  """Crop size in pixels."""

+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
  """Hardware accelerator, if any."""

  CWD = pathlib.Path(".")
  """Current working directory."""

+ N_SAE_LATENTS = 3
+ """Number of SAE latents to show."""
+
+ N_LATENT_EXAMPLES = 4
+ """Number of examples per SAE latent to show."""

  ##########
  # Models #

@@ -63,27 +58,7 @@ CWD = pathlib.Path(".")


  @functools.cache
+ def load_sae(device: str) -> saev.nn.SparseAutoencoder:
      """
      Loads a sparse autoencoder from disk.
      """

@@ -102,37 +77,12 @@ def load_clf() -> torch.nn.Module:
      buffer = io.BytesIO(fd.read())

      model = torch.nn.Linear(**kwargs)
+     state_dict = torch.load(buffer, weights_only=True, map_location=DEVICE)
      model.load_state_dict(state_dict)
+     model = model.to(DEVICE).eval()
      return model


  ####################
  # Global Variables #
  ####################

@@ -143,13 +93,23 @@ def load_tensor(path: str | pathlib.Path) -> Tensor:
      return torch.load(path, weights_only=True, map_location="cpu")


+ @functools.cache
+ def load_tensors() -> tuple[
+     Int[Tensor, "d_sae k"],
+     UInt8[Tensor, "d_sae k n_patches"],
+     Bool[Tensor, " d_sae"],
+ ]:
+     """
+     Loads the tensors for the SAE for ADE20K.
+     """
+     top_img_i = load_tensor(CWD / "assets" / "top_img_i.pt")
+     top_values = load_tensor(CWD / "assets" / "top_values_uint8.pt")
+     sparsity = load_tensor(CWD / "assets" / "sparsity.pt")
+
+     mask = torch.ones(sparsity.shape, dtype=bool)
+     mask = mask & (sparsity < MAX_FREQ)
+
+     return top_img_i, top_values, mask


  ############
 
@@ -157,37 +117,42 @@ def load_tensor(path: str | pathlib.Path) -> Tensor:
  ############


+ @jaxtyped(typechecker=beartype.beartype)
+ def add_highlights(
+     img: Image.Image,
+     patches: Float[np.ndarray, " n_patches"],
+     *,
+     upper: int | None = None,
+     opacity: float = 0.9,
+ ) -> Image.Image:
+     if not len(patches):
+         return img
+
+     iw_np, ih_np = int(math.sqrt(len(patches))), int(math.sqrt(len(patches)))
+     iw_px, ih_px = img.size
+     pw_px, ph_px = iw_px // iw_np, ih_px // ih_np
+     assert iw_np * ih_np == len(patches)
+
+     # Create a transparent overlay
+     overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
+     draw = ImageDraw.Draw(overlay)
+
+     # Using semi-transparent red (255, 0, 0, alpha)
+     for p, val in enumerate(patches):
+         assert upper is not None
+         val /= upper + 1e-9
+         x_np, y_np = p % iw_np, p // ih_np
+         draw.rectangle(
+             [
+                 (x_np * pw_px, y_np * ph_px),
+                 (x_np * pw_px + pw_px, y_np * ph_px + ph_px),
+             ],
+             fill=(int(val * 256), 0, 0, int(opacity * val * 256)),
+         )

+     # Composite the original image and the overlay
+     return Image.alpha_composite(img.convert("RGBA"), overlay)


  #######################
 
@@ -202,12 +167,14 @@ class Example(typing.TypedDict):
      Used to store examples of SAE latent activations for visualization.
      """

+     index: int
+     """Dataset index."""
      orig_url: str
      """The URL or path to access the original example image."""
      highlighted_url: str
      """The URL or path to access the SAE-highlighted image."""
+     seg_url: str
+     """Base64-encoded version of the colored segmentation map."""


  @beartype.beartype
 
@@ -249,64 +216,73 @@ def get_sae_activations(image_i: int, patches: list[int]) -> list[SaeActivation]
      if not patches:
          return []

+     split_vit, vit_transform = modeling.load_vit(DEVICE)
+     sae = load_sae(DEVICE)

      img = data.get_image(image_i)

+     x_BCWH = vit_transform(img)[None, ...].to(DEVICE)

+     x_BPD = split_vit.forward_start(x_BCWH)
+     x_BPD = (
+         x_BPD.clamp(-1e-5, 1e5) - (constants.DINOV2_IMAGENET1K_MEAN).to(DEVICE)
      ) / constants.DINOV2_IMAGENET1K_SCALAR

+     # Need to pick out the right patches
+     # + 1 + 4 for 1 [CLS] token and 4 register tokens
+     x_PD = x_BPD[0, [p + 1 + 4 for p in patches]]
+     _, f_x_PS, _ = sae(x_PD)

+     f_x_S = einops.reduce(f_x_PS, "patches n_latents -> n_latents", "sum")
+     logger.info("Got SAE activations.")

+     top_img_i, top_values, mask = load_tensors()

      latents = torch.argsort(f_x_S, descending=True).cpu()
+     latents = latents[mask[latents]][:N_SAE_LATENTS].tolist()

+     sae_activations = []
      for latent in latents:
+         pairs, seen_i_im = [], set()
          for i_im, values_p in zip(top_img_i[latent].tolist(), top_values[latent]):
              if i_im in seen_i_im:
                  continue

+             pairs.append((i_im, values_p))
              seen_i_im.add(i_im)
+             if len(pairs) >= N_LATENT_EXAMPLES:
+                 break

          # How to scale values.
          upper = None
          if top_values[latent].numel() > 0:
              upper = top_values[latent].max().item()

+         examples = []
+         for i_im, values_p in pairs:
+             seg_sized = data.to_sized(data.get_seg(i_im))
+             img_sized = data.to_sized(data.get_image(i_im))
+
+             seg_u8_sized = data.to_u8(seg_sized)
+             seg_img_sized = data.u8_to_img(seg_u8_sized)
+
+             highlighted_sized = add_highlights(
+                 img_sized, values_p.float().numpy(), upper=upper
+             )
+
+             examples.append({
+                 "index": i_im,
+                 "orig_url": data.img_to_base64(img_sized),
+                 "highlighted_url": data.img_to_base64(highlighted_sized),
+                 "seg_url": data.img_to_base64(seg_img_sized),
+             })

+         sae_activations.append({
+             "latent": latent,
+             "examples": examples,
+         })

+     return sae_activations


  @torch.inference_mode
 
@@ -416,29 +392,6 @@ def upsample(
      )


  with gr.Blocks() as demo:
      image_number = gr.Number(label="Validation Example")
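To sanity-check the new highlighting path in isolation, here is a minimal sketch that drives add_highlights directly. It assumes app.py can be imported as a module and that the 448x448 crop corresponds to a 16x16 grid of patches (256 activation values); the random values and the output filename are illustrative only, not part of the commit.

    import numpy as np
    from PIL import Image

    from app import add_highlights  # assumes app.py is on the import path

    img = Image.new("RGB", (448, 448), color="gray")
    values = np.random.rand(256).astype(np.float32)  # one activation per patch
    highlighted = add_highlights(img, values, upper=1)
    highlighted.convert("RGB").save("highlighted_demo.png")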
 
modeling.py ADDED
@@ -0,0 +1,53 @@
+ import functools
+ import logging
+ import typing
+
+ import beartype
+ import torch
+ from jaxtyping import Float, jaxtyped
+ from torch import Tensor
+ from torchvision.transforms import v2
+
+ logger = logging.getLogger("modeling.py")
+
+
+ @jaxtyped(typechecker=beartype.beartype)
+ class SplitDinov2(torch.nn.Module):
+     def __init__(self, *, split_at: int):
+         super().__init__()
+
+         self.vit = torch.hub.load("facebookresearch/dinov2", "dinov2_vitb14_reg").eval()
+         self.split_at = split_at
+
+     def forward_start(
+         self, x: Float[Tensor, "batch channels width height"]
+     ) -> Float[Tensor, "batch patches dim"]:
+         x_BPD = self.vit.prepare_tokens_with_masks(x)
+         for blk in self.vit.blocks[: self.split_at]:
+             x_BPD = blk(x_BPD)
+
+         return x_BPD
+
+     def forward_end(
+         self, x_BPD: Float[Tensor, "batch n_patches dim"]
+     ) -> Float[Tensor, "batch patches dim"]:
+         for blk in self.vit.blocks[-self.split_at :]:
+             x_BPD = blk(x_BPD)
+
+         x_BPD = self.vit.norm(x_BPD)
+         return x_BPD[:, self.vit.num_register_tokens + 1 :]
+
+
+ @functools.cache
+ def load_vit(device: str) -> tuple[SplitDinov2, typing.Callable]:
+     vit = SplitDinov2(split_at=11).to(device)
+     vit_transform = v2.Compose([
+         v2.Resize(size=(256, 256)),
+         v2.CenterCrop(size=(224, 224)),
+         v2.ToImage(),
+         v2.ToDtype(torch.float32, scale=True),
+         v2.Normalize(mean=[0.4850, 0.4560, 0.4060], std=[0.2290, 0.2240, 0.2250]),
+     ])
+     logger.info("Loaded ViT.")
+
+     return vit, vit_transform
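A minimal sketch of how modeling.load_vit and SplitDinov2.forward_start are meant to be wired together, mirroring the first half of get_sae_activations in app.py. The placeholder image, the local device variable, and the printed shape are illustrative assumptions, not part of the commit.

    import torch
    from PIL import Image

    import modeling

    device = "cuda" if torch.cuda.is_available() else "cpu"
    split_vit, vit_transform = modeling.load_vit(device)

    # A solid 256x256 placeholder stands in for data.get_image(i).
    img = Image.new("RGB", (256, 256), color="gray")
    x_BCWH = vit_transform(img)[None, ...].to(device)

    with torch.inference_mode():
        x_BPD = split_vit.forward_start(x_BCWH)

    # 1 [CLS] + 4 register tokens + 256 patches for dinov2_vitb14_reg at 224x224.
    print(x_BPD.shape)  # expected: (1, 261, 768)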