surokpro2 committed on
Commit
8cd00a9
1 Parent(s): c65c6b4

Upload folder using huggingface_hub

Files changed (38)
  1. .gitattributes +1 -0
  2. .gitignore +164 -0
  3. LICENSE +21 -0
  4. README.MD +68 -0
  5. README.md +3 -9
  6. SAE/__init__.py +1 -0
  7. SAE/config.json +23 -0
  8. SAE/dataset_iterator.py +53 -0
  9. SAE/sae.py +216 -0
  10. SAE/sae_utils.py +47 -0
  11. SDLens/__init__.py +1 -0
  12. SDLens/hooked_scheduler.py +40 -0
  13. SDLens/hooked_sd_pipeline.py +319 -0
  14. app.ipynb +0 -0
  15. app.py +399 -0
  16. checkpoints/unet.down_blocks.2.attentions.1_k10_hidden5120_auxk256_bs4096_lr0.0001/final/config.json +1 -0
  17. checkpoints/unet.down_blocks.2.attentions.1_k10_hidden5120_auxk256_bs4096_lr0.0001/final/mean.pt +3 -0
  18. checkpoints/unet.down_blocks.2.attentions.1_k10_hidden5120_auxk256_bs4096_lr0.0001/final/state_dict.pth +3 -0
  19. checkpoints/unet.down_blocks.2.attentions.1_k10_hidden5120_auxk256_bs4096_lr0.0001/final/std.pt +3 -0
  20. checkpoints/unet.mid_block.attentions.0_k10_hidden5120_auxk256_bs4096_lr0.0001/final/config.json +1 -0
  21. checkpoints/unet.mid_block.attentions.0_k10_hidden5120_auxk256_bs4096_lr0.0001/final/mean.pt +3 -0
  22. checkpoints/unet.mid_block.attentions.0_k10_hidden5120_auxk256_bs4096_lr0.0001/final/state_dict.pth +3 -0
  23. checkpoints/unet.mid_block.attentions.0_k10_hidden5120_auxk256_bs4096_lr0.0001/final/std.pt +3 -0
  24. checkpoints/unet.up_blocks.0.attentions.0_k10_hidden5120_auxk256_bs4096_lr0.0001/final/config.json +1 -0
  25. checkpoints/unet.up_blocks.0.attentions.0_k10_hidden5120_auxk256_bs4096_lr0.0001/final/mean.pt +3 -0
  26. checkpoints/unet.up_blocks.0.attentions.0_k10_hidden5120_auxk256_bs4096_lr0.0001/final/state_dict.pth +3 -0
  27. checkpoints/unet.up_blocks.0.attentions.0_k10_hidden5120_auxk256_bs4096_lr0.0001/final/std.pt +3 -0
  28. checkpoints/unet.up_blocks.0.attentions.1_k10_hidden5120_auxk256_bs4096_lr0.0001/final/config.json +1 -0
  29. checkpoints/unet.up_blocks.0.attentions.1_k10_hidden5120_auxk256_bs4096_lr0.0001/final/mean.pt +3 -0
  30. checkpoints/unet.up_blocks.0.attentions.1_k10_hidden5120_auxk256_bs4096_lr0.0001/final/state_dict.pth +3 -0
  31. checkpoints/unet.up_blocks.0.attentions.1_k10_hidden5120_auxk256_bs4096_lr0.0001/final/std.pt +3 -0
  32. example.ipynb +0 -0
  33. requirements.txt +7 -0
  34. resourses/image.png +3 -0
  35. scripts/collect_latents_dataset.py +96 -0
  36. scripts/train_sae.py +308 -0
  37. utils/__init__.py +1 -0
  38. utils/hooks.py +45 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ resourses/image.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,164 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110
+ .pdm.toml
111
+ .pdm-python
112
+ .pdm-build/
113
+
114
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115
+ __pypackages__/
116
+
117
+ # Celery stuff
118
+ celerybeat-schedule
119
+ celerybeat.pid
120
+
121
+ # SageMath parsed files
122
+ *.sage.py
123
+
124
+ # Environments
125
+ .env
126
+ .venv
127
+ env/
128
+ venv/
129
+ ENV/
130
+ env.bak/
131
+ venv.bak/
132
+
133
+ # Spyder project settings
134
+ .spyderproject
135
+ .spyproject
136
+
137
+ # Rope project settings
138
+ .ropeproject
139
+
140
+ # mkdocs documentation
141
+ /site
142
+
143
+ # mypy
144
+ .mypy_cache/
145
+ .dmypy.json
146
+ dmypy.json
147
+
148
+ # Pyre type checker
149
+ .pyre/
150
+
151
+ # pytype static type analyzer
152
+ .pytype/
153
+
154
+ # Cython debug symbols
155
+ cython_debug/
156
+
157
+ # PyCharm
158
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
161
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162
+ #.idea/
163
+
164
+ wandb/
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 Viacheslav Surkov
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.MD ADDED
@@ -0,0 +1,68 @@
+ # Unpacking SDXL Turbo: Interpreting Text-to-Image Models with Sparse Autoencoders
+
+ ![modification demonstration](resourses/image.png)
+
+ This repository contains code to reproduce results from our paper on using sparse autoencoders (SAEs) to analyze and interpret the internal representations of text-to-image diffusion models, specifically SDXL Turbo.
+
+ ## Repository Structure
+
+ ```
+ |-- SAE/                             # Core sparse autoencoder implementation
+ |-- SDLens/                          # Tools for analyzing diffusion models
+ |   `-- hooked_sd_pipeline.py        # Modified stable diffusion pipeline
+ |-- scripts/
+ |   |-- collect_latents_dataset.py   # Generate training data
+ |   `-- train_sae.py                 # Train SAE models
+ |-- utils/
+ |   `-- hooks.py                     # Hook utility functions
+ |-- checkpoints/                     # Pretrained SAE model checkpoints
+ |-- app.py                           # Demo application
+ |-- app.ipynb                        # Interactive notebook demo
+ |-- example.ipynb                    # Usage examples
+ `-- requirements.txt                 # Python dependencies
+ ```
+
+ ## Installation
+
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ ## Demo Application
+
+ You can try our Gradio demo application (`app.ipynb`) to browse and experiment with 20K+ features of our trained SAEs out of the box. You can find the same notebook on [Google Colab](https://colab.research.google.com/drive/1Sd-g3w2Fwv7pc_fxgeQOR3S_RKr18qMP?usp=sharing).
+
+ ## Usage
+
+ 1. Collect latent data from SDXL Turbo:
+ ```bash
+ python scripts/collect_latents_dataset.py --save_path={your_save_path}
+ ```
+
+ 2. Train sparse autoencoders:
+
+ 2.1. Insert the path to the stored latents and the directory for storing checkpoints in `SAE/config.json`
+
+ 2.2. Run the training script:
+
+ ```bash
+ python scripts/train_sae.py
+ ```
+
+ ## Pretrained Models
+
+ We provide pretrained SAE checkpoints for 4 key transformer blocks in SDXL Turbo's U-Net. See `example.ipynb` for analysis examples and visualization of learned features.
+
+
+ ## Citation
+
+ If you find this code useful in your research, please cite our paper:
+
+ ```bibtex
+ [Citation placeholder]
+ ```
+
+ ## Acknowledgements
+
+ The SAE component was implemented based on the [`openai/sparse_autoencoder`](https://github.com/openai/sparse_autoencoder) repository.
+
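For orientation (not part of the commit itself), here is a minimal sketch of loading one of the checkpoints uploaded in this commit with the `SparseAutoencoder` class added in `SAE/sae.py` below. The checkpoint directory name is copied from the file list above; the random input is only meant to show the expected shapes, and a repository root on `PYTHONPATH` is assumed.

```python
# Hypothetical usage sketch; assumes the repository root is importable and the
# checkpoints/ folder from this commit is present locally.
import torch
from SAE import SparseAutoencoder

ckpt = "checkpoints/unet.down_blocks.2.attentions.1_k10_hidden5120_auxk256_bs4096_lr0.0001/final"
sae = SparseAutoencoder.load_from_disk(ckpt)                       # reads config.json and state_dict.pth
feature_means = torch.load(f"{ckpt}/mean.pt", weights_only=True)   # per-feature statistics, as used by app.py

x = torch.randn(4, 1280)           # d_model is 1280 for these blocks
latents = sae.encode(x)            # shape (4, 5120), at most k = 10 nonzeros per row
print(latents.shape, (latents > 0).sum(dim=-1))
```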
README.md CHANGED
@@ -1,12 +1,6 @@
  ---
- title: Unboxing SDXL With SAEs
- emoji: 🦀
- colorFrom: red
- colorTo: gray
- sdk: gradio
- sdk_version: 5.4.0
+ title: Unboxing_SDXL_with_SAEs
  app_file: app.py
- pinned: false
+ sdk: gradio
+ sdk_version: 4.44.1
  ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
SAE/__init__.py ADDED
@@ -0,0 +1 @@
+ from .sae import SparseAutoencoder
SAE/config.json ADDED
@@ -0,0 +1,23 @@
+ {
+     "sae_configs": [
+         {
+             "d_model": 1280,
+             "n_dirs": 5120,
+             "k": 20
+         },
+         {
+             "d_model": 1280,
+             "n_dirs": 640,
+             "k": 20
+         }
+     ],
+     "bs": 4096,
+     "log_interval": 500,
+     "save_interval": 5000,
+
+     "paths_to_latents": [
+         "PASS YOUR PATHS HERE. Example /home/username/latents/<timestamp>. It should contain tar archives with latents."
+     ],
+     "save_path_base": "<Your SAE save path>",
+     "block_name": "unet.down_blocks.2.attentions.1"
+ }
SAE/dataset_iterator.py ADDED
@@ -0,0 +1,53 @@
+ import webdataset as wds
+ import os
+ import torch
+
+ class ActivationsDataloader:
+     def __init__(self, paths_to_datasets, block_name, batch_size, output_or_diff='diff', num_in_buffer=50):
+         assert output_or_diff in ['diff', 'output'], "Provide 'output' or 'diff'"
+
+         self.dataset = wds.WebDataset(
+             [os.path.join(path_to_dataset, f"{block_name}.tar")
+              for path_to_dataset in paths_to_datasets]
+         ).decode("torch")
+         self.iter = iter(self.dataset)
+         self.buffer = None
+         self.pointer = 0
+         self.num_in_buffer = num_in_buffer
+         self.output_or_diff = output_or_diff
+         self.batch_size = batch_size
+         self.one_size = None
+
+     def renew_buffer(self, to_retrieve):
+         to_merge = []
+         if self.buffer is not None and self.buffer.shape[0] > self.pointer:
+             to_merge = [self.buffer[self.pointer:].clone()]
+             del self.buffer
+         for _ in range(to_retrieve):
+             sample = next(self.iter)
+             latents = sample['output.pth'] if self.output_or_diff == 'output' else sample['diff.pth']
+             latents = latents.permute((0, 1, 3, 4, 2))
+             latents = latents.reshape((-1, latents.shape[-1]))
+             to_merge.append(latents.to('cuda'))
+             self.one_size = latents.shape[0]
+         self.buffer = torch.cat(to_merge, dim=0)
+         shuffled_indices = torch.randperm(self.buffer.shape[0])
+         self.buffer = self.buffer[shuffled_indices]
+         self.pointer = 0
+
+     def iterate(self):
+         while True:
+             if self.buffer is None or self.buffer.shape[0] - self.pointer < self.num_in_buffer * self.one_size * 4 // 5:
+                 try:
+                     to_retrieve = self.num_in_buffer if self.buffer is None else self.num_in_buffer // 5
+                     self.renew_buffer(to_retrieve)
+                 except StopIteration:
+                     break
+
+             batch = self.buffer[self.pointer: self.pointer + self.batch_size]
+             self.pointer += self.batch_size
+
+             assert batch.shape[0] == self.batch_size
+             yield batch
+
+
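A hypothetical driver for the `ActivationsDataloader` defined above; the dataset path and block name are placeholders, and a CUDA device is required because the class moves every shard to `'cuda'` as written.

```python
# Sketch only: paths_to_datasets must point at directories written by
# scripts/collect_latents_dataset.py (each containing <block_name>.tar).
from SAE.dataset_iterator import ActivationsDataloader

loader = ActivationsDataloader(
    paths_to_datasets=["/path/to/latents/<timestamp>"],
    block_name="unet.down_blocks.2.attentions.1",
    batch_size=4096,
    output_or_diff="diff",      # train on the block's (output - input) residual
)
for batch in loader.iterate():
    # batch: (4096, 1280) activation vectors, shuffled across prompts and spatial positions
    pass
```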
SAE/sae.py ADDED
@@ -0,0 +1,216 @@
+ '''
+ Adapted from
+ https://github.com/openai/sparse_autoencoder/blob/main/sparse_autoencoder/model.py
+ '''
+
+ import torch
+ import torch.nn as nn
+ import os
+ import json
+
+ class SparseAutoencoder(nn.Module):
+     """
+     Top-K Autoencoder with sparse kernels. Implements:
+
+         latents = relu(topk(encoder(x - pre_bias) + latent_bias))
+         recons = decoder(latents) + pre_bias
+     """
+
+     def __init__(
+         self,
+         n_dirs_local: int,
+         d_model: int,
+         k: int,
+         auxk: int | None,
+         dead_steps_threshold: int,
+     ):
+         super().__init__()
+         self.n_dirs_local = n_dirs_local
+         self.d_model = d_model
+         self.k = k
+         self.auxk = auxk
+         self.dead_steps_threshold = dead_steps_threshold
+
+         self.encoder = nn.Linear(d_model, n_dirs_local, bias=False)
+         self.decoder = nn.Linear(n_dirs_local, d_model, bias=False)
+
+         self.pre_bias = nn.Parameter(torch.zeros(d_model))
+         self.latent_bias = nn.Parameter(torch.zeros(n_dirs_local))
+
+         self.stats_last_nonzero: torch.Tensor
+         self.register_buffer("stats_last_nonzero", torch.zeros(n_dirs_local, dtype=torch.long))
+
+         def auxk_mask_fn(x):
+             dead_mask = self.stats_last_nonzero > dead_steps_threshold
+             x.data *= dead_mask  # inplace to save memory
+             return x
+
+         self.auxk_mask_fn = auxk_mask_fn
+
+         ## initialization
+
+         # "tied" init
+         self.decoder.weight.data = self.encoder.weight.data.T.clone()
+
+         # store decoder in column major layout for kernel
+         self.decoder.weight.data = self.decoder.weight.data.T.contiguous().T
+
+         unit_norm_decoder_(self)
+
+     def save_to_disk(self, path: str):
+         PATH_TO_CFG = 'config.json'
+         PATH_TO_WEIGHTS = 'state_dict.pth'
+
+         cfg = {
+             "n_dirs_local": self.n_dirs_local,
+             "d_model": self.d_model,
+             "k": self.k,
+             "auxk": self.auxk,
+             "dead_steps_threshold": self.dead_steps_threshold,
+         }
+
+         os.makedirs(path, exist_ok=True)
+
+         with open(os.path.join(path, PATH_TO_CFG), 'w') as f:
+             json.dump(cfg, f)
+
+         torch.save({
+             "state_dict": self.state_dict(),
+         }, os.path.join(path, PATH_TO_WEIGHTS))
+
+     @classmethod
+     def load_from_disk(cls, path: str):
+         PATH_TO_CFG = 'config.json'
+         PATH_TO_WEIGHTS = 'state_dict.pth'
+
+         with open(os.path.join(path, PATH_TO_CFG), 'r') as f:
+             cfg = json.load(f)
+
+         ae = cls(
+             n_dirs_local=cfg["n_dirs_local"],
+             d_model=cfg["d_model"],
+             k=cfg["k"],
+             auxk=cfg["auxk"],
+             dead_steps_threshold=cfg["dead_steps_threshold"],
+         )
+
+         state_dict = torch.load(os.path.join(path, PATH_TO_WEIGHTS))["state_dict"]
+         ae.load_state_dict(state_dict)
+
+         return ae
+
+     @property
+     def n_dirs(self):
+         return self.n_dirs_local
+
+     def encode(self, x):
+         x = x - self.pre_bias
+         latents_pre_act = self.encoder(x) + self.latent_bias
+
+         vals, inds = torch.topk(
+             latents_pre_act,
+             k=self.k,
+             dim=-1
+         )
+
+         latents = torch.zeros_like(latents_pre_act)
+         latents.scatter_(-1, inds, torch.relu(vals))
+
+         return latents
+
+     def forward(self, x):
+         x = x - self.pre_bias
+         latents_pre_act = self.encoder(x) + self.latent_bias
+         vals, inds = torch.topk(
+             latents_pre_act,
+             k=self.k,
+             dim=-1
+         )
+
+         ## set num nonzero stat ##
+         tmp = torch.zeros_like(self.stats_last_nonzero)
+         tmp.scatter_add_(
+             0,
+             inds.reshape(-1),
+             (vals > 1e-3).to(tmp.dtype).reshape(-1),
+         )
+         self.stats_last_nonzero *= 1 - tmp.clamp(max=1)
+         self.stats_last_nonzero += 1
+         ## end stats ##
+
+         ## auxk
+         if self.auxk is not None:  # for auxk
+             # IMPORTANT: has to go after stats update!
+             # WARN: auxk_mask_fn can mutate latents_pre_act!
+             auxk_vals, auxk_inds = torch.topk(
+                 self.auxk_mask_fn(latents_pre_act),
+                 k=self.auxk,
+                 dim=-1
+             )
+         else:
+             auxk_inds = None
+             auxk_vals = None
+
+         ## end auxk
+
+         vals = torch.relu(vals)
+         if auxk_vals is not None:
+             auxk_vals = torch.relu(auxk_vals)
+
+         rows, cols = latents_pre_act.size()
+         row_indices = torch.arange(rows).unsqueeze(1).expand(-1, self.k).reshape(-1)
+         vals = vals.reshape(-1)
+         inds = inds.reshape(-1)
+
+         indices = torch.stack([row_indices.to(inds.device), inds])
+
+         sparse_tensor = torch.sparse_coo_tensor(indices, vals, torch.Size([rows, cols]))
+
+         recons = torch.sparse.mm(sparse_tensor, self.decoder.weight.T) + self.pre_bias
+
+         return recons, {
+             "inds": inds,
+             "vals": vals,
+             "auxk_inds": auxk_inds,
+             "auxk_vals": auxk_vals,
+         }
+
+     def decode_sparse(self, inds, vals):
+         rows, cols = inds.shape[0], self.n_dirs
+
+         row_indices = torch.arange(rows).unsqueeze(1).expand(-1, inds.shape[1]).reshape(-1)
+         vals = vals.reshape(-1)
+         inds = inds.reshape(-1)
+
+         indices = torch.stack([row_indices.to(inds.device), inds])
+
+         sparse_tensor = torch.sparse_coo_tensor(indices, vals, torch.Size([rows, cols]))
+
+         recons = torch.sparse.mm(sparse_tensor, self.decoder.weight.T) + self.pre_bias
+         return recons
+
+     @property
+     def device(self):
+         return next(self.parameters()).device
+
+
+ def unit_norm_decoder_(autoencoder: SparseAutoencoder) -> None:
+     """
+     Unit normalize the decoder weights of an autoencoder.
+     """
+     autoencoder.decoder.weight.data /= autoencoder.decoder.weight.data.norm(dim=0)
+
+
+ def unit_norm_decoder_grad_adjustment_(autoencoder) -> None:
+     """project out gradient information parallel to the dictionary vectors - assumes that the decoder is already unit normed"""
+
+     assert autoencoder.decoder.weight.grad is not None
+
+     autoencoder.decoder.weight.grad +=\
+         torch.einsum("bn,bn->n", autoencoder.decoder.weight.data, autoencoder.decoder.weight.grad) *\
+         autoencoder.decoder.weight.data * -1
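A hypothetical round trip through the class above, using the hyperparameters of the checkpoints shipped in this commit (n_dirs=5120, d_model=1280, k=10, auxk=256, dead_steps_threshold=2441); the point is the shapes and the sparse output format, not training.

```python
# Sketch under the assumption that SAE/sae.py is importable as shown in this commit.
import torch
from SAE.sae import SparseAutoencoder

sae = SparseAutoencoder(
    n_dirs_local=5120, d_model=1280, k=10, auxk=256, dead_steps_threshold=2441
)
x = torch.randn(8, 1280)

recons, info = sae(x)                       # reconstruction plus flattened top-k indices/values
assert recons.shape == x.shape
latents = sae.encode(x)                     # dense (8, 5120), at most k nonzeros per row
recons_again = sae.decode_sparse(
    info["inds"].reshape(8, -1),            # forward() flattens indices/values, so reshape per row
    info["vals"].reshape(8, -1),
)
```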
SAE/sae_utils.py ADDED
@@ -0,0 +1,47 @@
+ import torch
+ from dataclasses import dataclass, field
+
+ @dataclass
+ class SAETrainingConfig:
+     d_model: int
+     n_dirs: int
+     k: int
+     block_name: str
+     bs: int
+     save_path_base: str
+     auxk: int = 256
+     lr: float = 1e-4
+     eps: float = 6.25e-10
+     dead_toks_threshold: int = 10_000_000
+     auxk_coef: float = 1/32
+
+     @property
+     def sae_name(self):
+         return f'{self.block_name}_k{self.k}_hidden{self.n_dirs}_auxk{self.auxk}_bs{self.bs}_lr{self.lr}'
+
+     @property
+     def save_path(self):
+         return f'/dlabscratch1/surkov/sae_models/{self.block_name}_k{self.k}_hidden{self.n_dirs}_auxk{self.auxk}_bs{self.bs}_lr{self.lr}'
+
+
+ @dataclass
+ class Config:
+     saes: list[SAETrainingConfig]
+     paths_to_latents: list[str]
+     log_interval: int
+     save_interval: int
+     bs: int
+     block_name: str
+     wandb_project: str = 'sdxl_sae_train'
+     wandb_name: str = 'multiple_sae'
+
+     def __init__(self, cfg_json):
+         self.saes = [SAETrainingConfig(**sae_cfg, block_name=cfg_json['block_name'], bs=cfg_json['bs'], save_path_base=cfg_json['save_path_base'])
+                      for sae_cfg in cfg_json['sae_configs']]
+
+         self.save_path_base = cfg_json['save_path_base']
+         self.paths_to_latents = cfg_json['paths_to_latents']
+         self.log_interval = cfg_json['log_interval']
+         self.save_interval = cfg_json['save_interval']
+         self.bs = cfg_json['bs']
+         self.block_name = cfg_json['block_name']
SDLens/__init__.py ADDED
@@ -0,0 +1 @@
+ from .hooked_sd_pipeline import HookedIFPipeline, HookedStableDiffusionXLPipeline
SDLens/hooked_scheduler.py ADDED
@@ -0,0 +1,40 @@
+ from diffusers import DDPMScheduler
+ import torch
+
+ class HookedNoiseScheduler:
+     scheduler: DDPMScheduler
+     pre_hooks: list
+     post_hooks: list
+
+     def __init__(self, scheduler):
+         object.__setattr__(self, 'scheduler', scheduler)
+         object.__setattr__(self, 'pre_hooks', [])
+         object.__setattr__(self, 'post_hooks', [])
+
+     def step(
+         self,
+         model_output, timestep, sample, generator, return_dict
+     ):
+         assert return_dict == False, "return_dict == True is not implemented"
+         for hook in self.pre_hooks:
+             hook_output = hook(model_output, timestep, sample, generator)
+             if hook_output is not None:
+                 model_output, timestep, sample, generator = hook_output
+
+         (pred_prev_sample, ) = self.scheduler.step(model_output, timestep, sample, generator, return_dict)
+
+         for hook in self.post_hooks:
+             hook_output = hook(pred_prev_sample)
+             if hook_output is not None:
+                 pred_prev_sample = hook_output
+
+         return (pred_prev_sample, )
+
+     def __getattr__(self, name):
+         return getattr(self.scheduler, name)
+
+     def __setattr__(self, name, value):
+         if name in {'scheduler', 'pre_hooks', 'post_hooks'}:
+             object.__setattr__(self, name, value)
+         else:
+             setattr(self.scheduler, name, value)
SDLens/hooked_sd_pipeline.py ADDED
@@ -0,0 +1,319 @@
1
+ import einops
2
+ from diffusers import StableDiffusionXLPipeline, IFPipeline
3
+ from typing import List, Dict, Callable, Union
4
+ import torch
5
+ from .hooked_scheduler import HookedNoiseScheduler
6
+
7
+ def retrieve(io):
8
+ if isinstance(io, tuple):
9
+ if len(io) == 1:
10
+ return io[0]
11
+ else:
12
+ raise ValueError("A tuple should have length of 1")
13
+ elif isinstance(io, torch.Tensor):
14
+ return io
15
+ else:
16
+ raise ValueError("Input/Output must be a tensor, or 1-element tuple")
17
+
18
+
19
+ class HookedDiffusionAbstractPipeline:
20
+ parent_cls = None
21
+ pipe = None
22
+
23
+ def __init__(self, pipe: parent_cls, use_hooked_scheduler: bool = False):
24
+ if use_hooked_scheduler:
25
+ pipe.scheduler = HookedNoiseScheduler(pipe.scheduler)
26
+ self.__dict__['pipe'] = pipe
27
+ self.use_hooked_scheduler = use_hooked_scheduler
28
+
29
+ @classmethod
30
+ def from_pretrained(cls, *args, **kwargs):
31
+ return cls(cls.parent_cls.from_pretrained(*args, **kwargs))
32
+
33
+
34
+ def run_with_hooks(self,
35
+ *args,
36
+ position_hook_dict: Dict[str, Union[Callable, List[Callable]]],
37
+ **kwargs
38
+ ):
39
+ '''
40
+ Run the pipeline with hooks at specified positions.
41
+ Returns the final output.
42
+
43
+ Args:
44
+ *args: Arguments to pass to the pipeline.
45
+ position_hook_dict: A dictionary mapping positions to hooks.
46
+ The keys are positions in the pipeline where the hooks should be registered.
47
+ The values are either a single hook or a list of hooks to be registered at the specified position.
48
+ Each hook should be a callable that takes three arguments: (module, input, output).
49
+ **kwargs: Keyword arguments to pass to the pipeline.
50
+ '''
51
+ hooks = []
52
+ for position, hook in position_hook_dict.items():
53
+ if isinstance(hook, list):
54
+ for h in hook:
55
+ hooks.append(self._register_general_hook(position, h))
56
+ else:
57
+ hooks.append(self._register_general_hook(position, hook))
58
+
59
+ hooks = [hook for hook in hooks if hook is not None]
60
+
61
+ try:
62
+ output = self.pipe(*args, **kwargs)
63
+ finally:
64
+ for hook in hooks:
65
+ hook.remove()
66
+ if self.use_hooked_scheduler:
67
+ self.pipe.scheduler.pre_hooks = []
68
+ self.pipe.scheduler.post_hooks = []
69
+
70
+ return output
71
+
72
+ def run_with_cache(self,
73
+ *args,
74
+ positions_to_cache: List[str],
75
+ save_input: bool = False,
76
+ save_output: bool = True,
77
+ **kwargs
78
+ ):
79
+ '''
80
+ Run the pipeline with caching at specified positions.
81
+
82
+ This method allows you to cache the intermediate inputs and/or outputs of the pipeline
83
+ at certain positions. The final output of the pipeline and a dictionary of cached values
84
+ are returned.
85
+
86
+ Args:
87
+ *args: Arguments to pass to the pipeline.
88
+ positions_to_cache (List[str]): A list of positions in the pipeline where intermediate
89
+ inputs/outputs should be cached.
90
+ save_input (bool, optional): If True, caches the input at each specified position.
91
+ Defaults to False.
92
+ save_output (bool, optional): If True, caches the output at each specified position.
93
+ Defaults to True.
94
+ **kwargs: Keyword arguments to pass to the pipeline.
95
+
96
+ Returns:
97
+ final_output: The final output of the pipeline after execution.
98
+ cache_dict (Dict[str, Dict[str, Any]]): A dictionary where keys are the specified positions
99
+ and values are dictionaries containing the cached 'input' and/or 'output' at each position,
100
+ depending on the flags `save_input` and `save_output`.
101
+ '''
102
+ cache_input, cache_output = dict() if save_input else None, dict() if save_output else None
103
+ hooks = [
104
+ self._register_cache_hook(position, cache_input, cache_output) for position in positions_to_cache
105
+ ]
106
+ hooks = [hook for hook in hooks if hook is not None]
107
+ output = self.pipe(*args, **kwargs)
108
+ for hook in hooks:
109
+ hook.remove()
110
+ if self.use_hooked_scheduler:
111
+ self.pipe.scheduler.pre_hooks = []
112
+ self.pipe.scheduler.post_hooks = []
113
+
114
+ cache_dict = {}
115
+ if save_input:
116
+ for position, block in cache_input.items():
117
+ cache_input[position] = torch.stack(block, dim=1)
118
+ cache_dict['input'] = cache_input
119
+
120
+ if save_output:
121
+ for position, block in cache_output.items():
122
+ cache_output[position] = torch.stack(block, dim=1)
123
+ cache_dict['output'] = cache_output
124
+ return output, cache_dict
125
+
126
+ def run_with_hooks_and_cache(self,
127
+ *args,
128
+ position_hook_dict: Dict[str, Union[Callable, List[Callable]]],
129
+ positions_to_cache: List[str] = [],
130
+ save_input: bool = False,
131
+ save_output: bool = True,
132
+ **kwargs
133
+ ):
134
+ '''
135
+ Run the pipeline with hooks and caching at specified positions.
136
+
137
+ This method allows you to register hooks at certain positions in the pipeline and
138
+ cache intermediate inputs and/or outputs at specified positions. Hooks can be used
139
+ for inspecting or modifying the pipeline's execution, and caching stores intermediate
140
+ values for later inspection or use.
141
+
142
+ Args:
143
+ *args: Arguments to pass to the pipeline.
144
+ position_hook_dict Dict[str, Union[Callable, List[Callable]]]:
145
+ A dictionary where the keys are the positions in the pipeline, and the values
146
+ are hooks (either a single hook or a list of hooks) to be registered at those positions.
147
+ Each hook should be a callable that accepts three arguments: (module, input, output).
148
+ positions_to_cache (List[str], optional): A list of positions in the pipeline where
149
+ intermediate inputs/outputs should be cached. Defaults to an empty list.
150
+ save_input (bool, optional): If True, caches the input at each specified position.
151
+ Defaults to False.
152
+ save_output (bool, optional): If True, caches the output at each specified position.
153
+ Defaults to True.
154
+ **kwargs: Additional keyword arguments to pass to the pipeline.
155
+
156
+ Returns:
157
+ final_output: The final output of the pipeline after execution.
158
+ cache_dict (Dict[str, Dict[str, Any]]): A dictionary where keys are the specified positions
159
+ and values are dictionaries containing the cached 'input' and/or 'output' at each position,
160
+ depending on the flags `save_input` and `save_output`.
161
+ '''
162
+ cache_input, cache_output = dict() if save_input else None, dict() if save_output else None
163
+ hooks = [
164
+ self._register_cache_hook(position, cache_input, cache_output) for position in positions_to_cache
165
+ ]
166
+
167
+ for position, hook in position_hook_dict.items():
168
+ if isinstance(hook, list):
169
+ for h in hook:
170
+ hooks.append(self._register_general_hook(position, h))
171
+ else:
172
+ hooks.append(self._register_general_hook(position, hook))
173
+
174
+ hooks = [hook for hook in hooks if hook is not None]
175
+ output = self.pipe(*args, **kwargs)
176
+ for hook in hooks:
177
+ hook.remove()
178
+ if self.use_hooked_scheduler:
179
+ self.pipe.scheduler.pre_hooks = []
180
+ self.pipe.scheduler.post_hooks = []
181
+
182
+ cache_dict = {}
183
+ if save_input:
184
+ for position, block in cache_input.items():
185
+ cache_input[position] = torch.stack(block, dim=1)
186
+ cache_dict['input'] = cache_input
187
+
188
+ if save_output:
189
+ for position, block in cache_output.items():
190
+ cache_output[position] = torch.stack(block, dim=1)
191
+ cache_dict['output'] = cache_output
192
+
193
+ return output, cache_dict
194
+
195
+
196
+ def _locate_block(self, position: str):
197
+ '''
198
+ Locate the block at the specified position in the pipeline.
199
+ '''
200
+ block = self.pipe
201
+ for step in position.split('.'):
202
+ if step.isdigit():
203
+ step = int(step)
204
+ block = block[step]
205
+ else:
206
+ block = getattr(block, step)
207
+ return block
208
+
209
+
210
+ def _register_cache_hook(self, position: str, cache_input: Dict, cache_output: Dict):
211
+
212
+ if position.endswith('$self_attention') or position.endswith('$cross_attention'):
213
+ return self._register_cache_attention_hook(position, cache_output)
214
+
215
+ if position == 'noise':
216
+ def hook(model_output, timestep, sample, generator):
217
+ if position not in cache_output:
218
+ cache_output[position] = []
219
+ cache_output[position].append(sample)
220
+
221
+ if self.use_hooked_scheduler:
222
+ self.pipe.scheduler.post_hooks.append(hook)
223
+ else:
224
+ raise ValueError('Cannot cache noise without using hooked scheduler')
225
+ return
226
+
227
+ block = self._locate_block(position)
228
+
229
+ def hook(module, input, kwargs, output):
230
+ if cache_input is not None:
231
+ if position not in cache_input:
232
+ cache_input[position] = []
233
+ cache_input[position].append(retrieve(input))
234
+
235
+ if cache_output is not None:
236
+ if position not in cache_output:
237
+ cache_output[position] = []
238
+ cache_output[position].append(retrieve(output))
239
+
240
+ return block.register_forward_hook(hook, with_kwargs=True)
241
+
242
+ def _register_cache_attention_hook(self, position, cache):
243
+ attn_block = self._locate_block(position.split('$')[0])
244
+ if position.endswith('$self_attention'):
245
+ attn_block = attn_block.attn1
246
+ elif position.endswith('$cross_attention'):
247
+ attn_block = attn_block.attn2
248
+ else:
249
+ raise ValueError('Wrong attention type')
250
+
251
+ def hook(module, args, kwargs, output):
252
+ hidden_states = args[0]
253
+ encoder_hidden_states = kwargs['encoder_hidden_states']
254
+ attention_mask = kwargs['attention_mask']
255
+ batch_size, sequence_length, _ = hidden_states.shape
256
+ attention_mask = attn_block.prepare_attention_mask(attention_mask, sequence_length, batch_size)
257
+ query = attn_block.to_q(hidden_states)
258
+
259
+
260
+ if encoder_hidden_states is None:
261
+ encoder_hidden_states = hidden_states
262
+ elif attn_block.norm_cross is not None:
263
+ encoder_hidden_states = attn_block.norm_cross(encoder_hidden_states)
264
+
265
+ key = attn_block.to_k(encoder_hidden_states)
266
+ value = attn_block.to_v(encoder_hidden_states)
267
+
268
+ query = attn_block.head_to_batch_dim(query)
269
+ key = attn_block.head_to_batch_dim(key)
270
+ value = attn_block.head_to_batch_dim(value)
271
+
272
+ attention_probs = attn_block.get_attention_scores(query, key, attention_mask)
273
+ attention_probs = attention_probs.view(
274
+ batch_size,
275
+ attention_probs.shape[0] // batch_size,
276
+ attention_probs.shape[1],
277
+ attention_probs.shape[2]
278
+ )
279
+ if position not in cache:
280
+ cache[position] = []
281
+ cache[position].append(attention_probs)
282
+
283
+ return attn_block.register_forward_hook(hook, with_kwargs=True)
284
+
285
+ def _register_general_hook(self, position, hook):
286
+ if position == 'scheduler_pre':
287
+ if not self.use_hooked_scheduler:
288
+ raise ValueError('Cannot register hooks on scheduler without using hooked scheduler')
289
+ self.pipe.scheduler.pre_hooks.append(hook)
290
+ return
291
+ elif position == 'scheduler_post':
292
+ if not self.use_hooked_scheduler:
293
+ raise ValueError('Cannot register hooks on scheduler without using hooked scheduler')
294
+ self.pipe.scheduler.post_hooks.append(hook)
295
+ return
296
+
297
+ block = self._locate_block(position)
298
+ return block.register_forward_hook(hook)
299
+
300
+ def to(self, *args, **kwargs):
301
+ self.pipe = self.pipe.to(*args, **kwargs)
302
+ return self
303
+
304
+ def __getattr__(self, name):
305
+ return getattr(self.pipe, name)
306
+
307
+ def __setattr__(self, name, value):
308
+ return setattr(self.pipe, name, value)
309
+
310
+ def __call__(self, *args, **kwargs):
311
+ return self.pipe(*args, **kwargs)
312
+
313
+
314
+ class HookedStableDiffusionXLPipeline(HookedDiffusionAbstractPipeline):
315
+ parent_cls = StableDiffusionXLPipeline
316
+
317
+
318
+ class HookedIFPipeline(HookedDiffusionAbstractPipeline):
319
+ parent_cls = IFPipeline
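A hypothetical sketch of the hook interface documented in the docstrings above; the model id, block path, and one-step/zero-guidance settings mirror how `app.py` below drives the pipeline, and availability of the SDXL Turbo weights plus a CUDA device is assumed.

```python
# Sketch only: a read-only forward hook that prints the cached block's output shape.
import torch
from SDLens import HookedStableDiffusionXLPipeline

pipe = HookedStableDiffusionXLPipeline.from_pretrained("stabilityai/sdxl-turbo")
pipe.to("cuda")

def print_shape_hook(module, input, output):
    # Forward hooks receive (module, input, output); returning None leaves the output unchanged.
    out = output[0] if isinstance(output, tuple) else output
    print(type(module).__name__, tuple(out.shape))

result = pipe.run_with_hooks(
    "a photo of a corgi",
    position_hook_dict={"unet.down_blocks.2.attentions.1": print_shape_hook},
    num_inference_steps=1,
    guidance_scale=0.0,
    generator=torch.Generator(device="cpu").manual_seed(42),
)
image = result.images[0]
```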
app.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,399 @@
1
+ import gradio as gr
2
+ import os
3
+ import torch
4
+ from PIL import Image
5
+ from SDLens import HookedStableDiffusionXLPipeline
6
+ from SAE import SparseAutoencoder
7
+ from utils import add_feature_on_area
8
+ import numpy as np
9
+ import matplotlib.pyplot as plt
10
+ from matplotlib.colors import ListedColormap
11
+ from utils import add_feature_on_area, replace_with_feature
12
+ import threading
13
+
14
+ code_to_block = {
15
+ "down.2.1": "unet.down_blocks.2.attentions.1",
16
+ "mid.0": "unet.mid_block.attentions.0",
17
+ "up.0.1": "unet.up_blocks.0.attentions.1",
18
+ "up.0.0": "unet.up_blocks.0.attentions.0"
19
+ }
20
+ lock = threading.Lock()
21
+
22
+ def process_cache(cache, saes_dict):
23
+
24
+ top_features_dict = {}
25
+ sparse_maps_dict = {}
26
+
27
+ for code in code_to_block.keys():
28
+ block = code_to_block[code]
29
+ sae = saes_dict[code]
30
+
31
+ diff = cache["output"][block] - cache["input"][block]
32
+ diff = diff.permute(0, 1, 3, 4, 2).squeeze(0).squeeze(0)
33
+ with torch.no_grad():
34
+ sparse_maps = sae.encode(diff)
35
+ averages = torch.mean(sparse_maps, dim=(0, 1))
36
+
37
+ top_features = torch.topk(averages, 10).indices
38
+
39
+ top_features_dict[code] = top_features.cpu().tolist()
40
+ sparse_maps_dict[code] = sparse_maps.cpu().numpy()
41
+
42
+ return top_features_dict, sparse_maps_dict
43
+
44
+
45
+ def plot_image_heatmap(cache, block_select, radio):
46
+ code = block_select.split()[0]
47
+ feature = int(radio)
48
+ block = code_to_block[code]
49
+
50
+ heatmap = cache["heatmaps"][code][:, :, feature]
51
+ heatmap = np.kron(heatmap, np.ones((32, 32)))
52
+ image = cache["image"].convert("RGBA")
53
+
54
+ jet = plt.cm.jet
55
+ cmap = jet(np.arange(jet.N))
56
+ cmap[:1, -1] = 0
57
+ cmap[1:, -1] = 0.6
58
+ cmap = ListedColormap(cmap)
59
+ heatmap = (heatmap - np.min(heatmap)) / (np.max(heatmap) - np.min(heatmap))
60
+ heatmap_rgba = cmap(heatmap)
61
+ heatmap_image = Image.fromarray((heatmap_rgba * 255).astype(np.uint8))
62
+ heatmap_with_transparency = Image.alpha_composite(image, heatmap_image)
63
+
64
+ return heatmap_with_transparency
65
+
66
+
67
+ def create_prompt_part(pipe, saes_dict, demo):
68
+ def image_gen(prompt):
69
+ lock.acquire()
70
+ try:
71
+ images, cache = pipe.run_with_cache(
72
+ prompt,
73
+ positions_to_cache=list(code_to_block.values()),
74
+ num_inference_steps=1,
75
+ generator=torch.Generator(device="cpu").manual_seed(42),
76
+ guidance_scale=0.0,
77
+ save_input=True,
78
+ save_output=True
79
+ )
80
+ finally:
81
+ lock.release()
82
+
83
+ top_features_dict, top_sparse_maps_dict = process_cache(cache, saes_dict)
84
+ return images.images[0], {
85
+ "image": images.images[0],
86
+ "heatmaps": top_sparse_maps_dict,
87
+ "features": top_features_dict
88
+ }
89
+
90
+ def update_radio(cache, block_select):
91
+ code = block_select.split()[0]
92
+ return gr.update(choices=cache["features"][code])
93
+
94
+ def update_img(cache, block_select, radio):
95
+ new_img = plot_image_heatmap(cache, block_select, radio)
96
+ return new_img
97
+
98
+ with gr.Tab("Explore", elem_classes="tabs") as explore_tab:
99
+ cache = gr.State(value={
100
+ "image": None,
101
+ "heatmaps": None,
102
+ "features": []
103
+ })
104
+ with gr.Row():
105
+ with gr.Column(scale=7):
106
+ with gr.Row(equal_height=True):
107
+ prompt_field = gr.Textbox(lines=1, label="Enter prompt here", value="A cinematic shot of a professor sloth wearing a tuxedo at a BBQ party and eathing a dish with peas.")
108
+ button = gr.Button("Generate", elem_classes="generate_button1")
109
+
110
+ with gr.Row():
111
+ image = gr.Image(width=512, height=512, image_mode="RGB", label="Generated image")
112
+
113
+ with gr.Column(scale=4):
114
+ block_select = gr.Dropdown(
115
+ choices=["up.0.1 (style)", "down.2.1 (composition)", "up.0.0 (details)", "mid.0"],
116
+ value="down.2.1 (composition)",
117
+ label="Select block",
118
+ elem_id="block_select",
119
+ interactive=True
120
+ )
121
+ radio = gr.Radio(choices=[], label="Select a feature", interactive=True)
122
+
123
+ button.click(image_gen, [prompt_field], outputs=[image, cache])
124
+ cache.change(update_radio, [cache, block_select], outputs=[radio])
125
+ block_select.select(update_radio, [cache, block_select], outputs=[radio])
126
+ radio.select(update_img, [cache, block_select, radio], outputs=[image])
127
+ demo.load(image_gen, [prompt_field], outputs=[image, cache])
128
+
129
+ return explore_tab
130
+
131
+ def downsample_mask(image, factor):
132
+ downsampled = image.reshape(
133
+ (image.shape[0] // factor, factor,
134
+ image.shape[1] // factor, factor)
135
+ )
136
+ downsampled = downsampled.mean(axis=(1, 3))
137
+ return downsampled
138
+
139
+ def create_intervene_part(pipe: HookedStableDiffusionXLPipeline, saes_dict, means_dict, demo):
140
+ def image_gen(prompt, num_steps):
141
+ lock.acquire()
142
+ try:
143
+ images = pipe.run_with_hooks(
144
+ prompt,
145
+ position_hook_dict={},
146
+ num_inference_steps=num_steps,
147
+ generator=torch.Generator(device="cpu").manual_seed(42),
148
+ guidance_scale=0.0
149
+ )
150
+ finally:
151
+ lock.release()
152
+ return images.images[0]
153
+
154
+ def image_mod(prompt, block_str, brush_index, strength, num_steps, input_image):
155
+ block = block_str.split(" ")[0]
156
+
157
+ mask = (input_image["layers"][0] > 0)[:, :, -1].astype(float)
158
+ mask = downsample_mask(mask, 32)
159
+ mask = torch.tensor(mask, dtype=torch.float32, device="cuda")
160
+
161
+ if mask.sum() == 0:
162
+ gr.Info("No mask selected, please draw on the input image")
163
+
164
+ def hook(module, input, output):
165
+ return add_feature_on_area(
166
+ saes_dict[block],
167
+ brush_index,
168
+ mask * means_dict[block][brush_index] * strength,
169
+ module,
170
+ input,
171
+ output
172
+ )
173
+
174
+ lock.acquire()
175
+ try:
176
+ image = pipe.run_with_hooks(
177
+ prompt,
178
+ position_hook_dict={code_to_block[block]: hook},
179
+ num_inference_steps=num_steps,
180
+ generator=torch.Generator(device="cpu").manual_seed(42),
181
+ guidance_scale=0.0
182
+ ).images[0]
183
+ finally:
184
+ lock.release()
185
+ return image
186
+
187
+ def feature_icon(block_str, brush_index):
188
+ block = block_str.split(" ")[0]
189
+ if block in ["mid.0", "up.0.0"]:
190
+ gr.Info("Note that Feature Icon works best with down.2.1 and up.0.1 blocks but feel free to explore", duration=3)
191
+
192
+ def hook(module, input, output):
193
+ return replace_with_feature(
194
+ saes_dict[block],
195
+ brush_index,
196
+ means_dict[block][brush_index] * saes_dict[block].k,
197
+ module,
198
+ input,
199
+ output
200
+ )
201
+
202
+ lock.acquire()
203
+ try:
204
+ image = pipe.run_with_hooks(
205
+ "",
206
+ position_hook_dict={code_to_block[block]: hook},
207
+ num_inference_steps=1,
208
+ generator=torch.Generator(device="cpu").manual_seed(42),
209
+ guidance_scale=0.0
210
+ ).images[0]
211
+ finally:
212
+ lock.release()
213
+ return image
214
+
215
+ with gr.Tab("Paint!", elem_classes="tabs") as intervene_tab:
216
+ image_state = gr.State(value=None)
217
+ with gr.Row():
218
+ with gr.Column(scale=3):
219
+ # Generation column
220
+ with gr.Row():
221
+ # prompt and num_steps
222
+ prompt_field = gr.Textbox(lines=1, label="Enter prompt here", value="A dog plays with a ball, cartoon", elem_id="prompt_input")
223
+ num_steps = gr.Number(value=1, label="Number of steps", minimum=1, maximum=4, elem_id="num_steps", precision=0)
224
+ with gr.Row():
225
+ # Generate button
226
+ button_generate = gr.Button("Generate", elem_id="generate_button")
227
+ with gr.Column(scale=3):
228
+ # Intervention column
229
+ with gr.Row():
230
+ # dropdowns and number inputs
231
+ with gr.Column(scale=7):
232
+ with gr.Row():
233
+ block_select = gr.Dropdown(
234
+ choices=["up.0.1 (style)", "down.2.1 (composition)", "up.0.0 (details)", "mid.0"],
235
+ value="down.2.1 (composition)",
236
+ label="Select block",
237
+ elem_id="block_select"
238
+ )
239
+ brush_index = gr.Number(value=0, label="Brush index", minimum=0, maximum=5119, elem_id="brush_index", precision=0)
240
+ with gr.Row():
241
+ button_icon = gr.Button('Feature Icon', elem_id="feature_icon_button")
242
+ with gr.Column(scale=3):
243
+ with gr.Row():
244
+ strength = gr.Number(value=10, label="Strength", minimum=-40, maximum=40, elem_id="strength", precision=2)
245
+ with gr.Row():
246
+ button = gr.Button('Apply', elem_id="apply_button")
247
+
248
+ with gr.Row():
249
+ with gr.Column():
250
+ # Input image
251
+ i_image = gr.Sketchpad(
252
+ height=610,
253
+ layers=False, transforms=[], placeholder="Generate and paint!",
254
+ brush=gr.Brush(default_size=64, color_mode="fixed", colors=['black']),
255
+ container=False,
256
+ canvas_size=(512, 512),
257
+ label="Input Image")
258
+ clear_button = gr.Button("Clear")
259
+ clear_button.click(lambda x: x, [image_state], [i_image])
260
+ # Output image
261
+ o_image = gr.Image(width=512, height=512, label="Output Image")
262
+
263
+ # Set up the click events
264
+ button_generate.click(image_gen, inputs=[prompt_field, num_steps], outputs=[image_state])
265
+ image_state.change(lambda x: x, [image_state], [i_image])
266
+ button.click(image_mod,
267
+ inputs=[prompt_field, block_select, brush_index, strength, num_steps, i_image],
268
+ outputs=o_image)
269
+ button_icon.click(feature_icon, inputs=[block_select, brush_index], outputs=o_image)
270
+ demo.load(image_gen, [prompt_field, num_steps], outputs=[image_state])
271
+
272
+
273
+ return intervene_tab
274
+
275
+
276
+ def create_top_images_part(demo):
277
+ def update_top_images(block_select, brush_index):
278
+ block = block_select.split(" ")[0]
279
+ url = f"https://huggingface.co/surokpro2/sdxl_sae_images/resolve/main/{block}/{brush_index}.jpg"
280
+ return url
281
+
282
+ with gr.Tab("Top Images", elem_classes="tabs") as top_images_tab:
283
+ with gr.Row():
284
+ block_select = gr.Dropdown(
285
+ choices=["up.0.1 (style)", "down.2.1 (composition)", "up.0.0 (details)", "mid.0"],
286
+ value="down.2.1 (composition)",
287
+ label="Select block"
288
+ )
289
+ brush_index = gr.Number(value=0, label="Brush index", minimum=0, maximum=5119, precision=0)
290
+ with gr.Row():
291
+ image = gr.Image(width=600, height=600, label="Top Images")
292
+
293
+ block_select.select(update_top_images, [block_select, brush_index], outputs=[image])
294
+ brush_index.change(update_top_images, [block_select, brush_index], outputs=[image])
295
+ demo.load(update_top_images, [block_select, brush_index], outputs=[image])
296
+ return top_images_tab
297
+
298
+
299
+ def create_intro_part():
300
+ with gr.Tab("Instructions", elem_classes="tabs") as intro_tab:
301
+ gr.Markdown(
302
+ '''# Unpacking SDXL Turbo with Sparse Autoencoders
303
+ ## Demo Overview
304
+ This demo showcases the use of Sparse Autoencoders (SAEs) to understand the features learned by the Stable Diffusion XL Turbo model.
305
+
306
+ ## How to Use
307
+ ### Explore
308
+ * Enter a prompt in the text box and click on the "Generate" button to generate an image.
309
+ * You can observe the active features in different blocks plot on top of the generated image.
310
+ ### Top Images
311
+ * For each feature, you can view the top images that activate the feature the most.
312
+ ### Paint!
313
+ * Generate an image using the prompt.
314
+ * Paint on the generated image to apply interventions.
315
+ * Use the "Feature Icon" button to understand how the selected brush functions.
316
+
317
+ ### Remarks
318
+ * Not all brushes mix well with all images. Experiment with different brushes and strengths.
319
+ * Feature Icon works best with `down.2.1 (composition)` and `up.0.1 (style)` blocks.
320
+ * This demo is provided for research purposes only. We do not take responsibility for the content generated by the demo.
321
+
322
+ ### Interesting features to try
323
+ To get started, try the following features:
324
+ - down.2.1 (composition): 2301 (evil) 3747 (image frame) 4998 (cartoon)
325
+ - up.0.1 (style): 4977 (tiger stripes) 90 (fur) 2615 (twilight blur)
326
+ '''
327
+ )
328
+
329
+ return intro_tab
330
+
331
+
332
+ def create_demo(pipe, saes_dict, means_dict):
333
+ custom_css = """
334
+ .tabs button {
335
+ font-size: 20px !important; /* Adjust font size for tab text */
336
+ padding: 10px !important; /* Adjust padding to make the tabs bigger */
337
+ font-weight: bold !important; /* Adjust font weight to make the text bold */
338
+ }
339
+ .generate_button1 {
340
+ max-width: 160px !important;
341
+ margin-top: 20px !important;
342
+ margin-bottom: 20px !important;
343
+ }
344
+ """
345
+
346
+ with gr.Blocks(css=custom_css) as demo:
347
+ with create_intro_part():
348
+ pass
349
+ with create_prompt_part(pipe, saes_dict, demo):
350
+ pass
351
+ with create_top_images_part(demo):
352
+ pass
353
+ with create_intervene_part(pipe, saes_dict, means_dict, demo):
354
+ pass
355
+
356
+ return demo
357
+
358
+
359
+ if __name__ == "__main__":
360
+ import os
361
+ import gradio as gr
362
+ import torch
363
+ from SDLens import HookedStableDiffusionXLPipeline
364
+ from SAE import SparseAutoencoder
365
+
366
+ dtype=torch.float32
367
+ pipe = HookedStableDiffusionXLPipeline.from_pretrained(
368
+ 'stabilityai/sdxl-turbo',
369
+ torch_dtype=dtype,
370
+ device_map="balanced",
371
+ variant=("fp16" if dtype==torch.float16 else None)
372
+ )
373
+ pipe.set_progress_bar_config(disable=True)
374
+
375
+ path_to_checkpoints = './checkpoints/'
376
+
377
+ code_to_block = {
378
+ "down.2.1": "unet.down_blocks.2.attentions.1",
379
+ "mid.0": "unet.mid_block.attentions.0",
380
+ "up.0.1": "unet.up_blocks.0.attentions.1",
381
+ "up.0.0": "unet.up_blocks.0.attentions.0"
382
+ }
383
+
384
+ saes_dict = {}
385
+ means_dict = {}
386
+
387
+ for code, block in code_to_block.items():
388
+ sae = SparseAutoencoder.load_from_disk(
389
+ os.path.join(path_to_checkpoints, f"{block}_k10_hidden5120_auxk256_bs4096_lr0.0001", "final"),
390
+ )
391
+ means = torch.load(
392
+ os.path.join(path_to_checkpoints, f"{block}_k10_hidden5120_auxk256_bs4096_lr0.0001", "final", "mean.pt"),
393
+ weights_only=True
394
+ )
395
+ saes_dict[code] = sae.to('cuda', dtype=dtype)
396
+ means_dict[code] = means.to('cuda', dtype=dtype)
397
+
398
+ demo = create_demo(pipe, saes_dict, means_dict)
399
+ demo.launch()
checkpoints/unet.down_blocks.2.attentions.1_k10_hidden5120_auxk256_bs4096_lr0.0001/final/config.json ADDED
@@ -0,0 +1 @@
+ {"n_dirs_local": 5120, "d_model": 1280, "k": 10, "auxk": 256, "dead_steps_threshold": 2441}
checkpoints/unet.down_blocks.2.attentions.1_k10_hidden5120_auxk256_bs4096_lr0.0001/final/mean.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:387f2b6f8c4e4a6f1227921f28f00dfa4beb2bd4e422b7eb592cd8627af0e58f
3
+ size 21581
checkpoints/unet.down_blocks.2.attentions.1_k10_hidden5120_auxk256_bs4096_lr0.0001/final/state_dict.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39e3c6d17aa572a53368ca8ba8f82757947a3caf14fe654e84b175d0dc0a4650
3
+ size 52497831
checkpoints/unet.down_blocks.2.attentions.1_k10_hidden5120_auxk256_bs4096_lr0.0001/final/std.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6ca694c9504a7a8aa827004d3fdec5c1cb8fcf3904acc3562d1861fc6e65c19
3
+ size 21576
checkpoints/unet.mid_block.attentions.0_k10_hidden5120_auxk256_bs4096_lr0.0001/final/config.json ADDED
@@ -0,0 +1 @@
+ {"n_dirs_local": 5120, "d_model": 1280, "k": 10, "auxk": 256, "dead_steps_threshold": 2441}
checkpoints/unet.mid_block.attentions.0_k10_hidden5120_auxk256_bs4096_lr0.0001/final/mean.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80790481d0e56ac3fa36599703cee7a05cfb4cc078db57c8f9180e860c330e1d
3
+ size 21581
checkpoints/unet.mid_block.attentions.0_k10_hidden5120_auxk256_bs4096_lr0.0001/final/state_dict.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49d38d9178c2a2780e04a5482a2feb9548c6e9a636ed1bf85291acf42e0ffa34
3
+ size 52497831
checkpoints/unet.mid_block.attentions.0_k10_hidden5120_auxk256_bs4096_lr0.0001/final/std.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb6bfc7ce5e596f8aa048ab262ca56841868c222bf07eb2ed35b6e4f7094fea6
3
+ size 21576
checkpoints/unet.up_blocks.0.attentions.0_k10_hidden5120_auxk256_bs4096_lr0.0001/final/config.json ADDED
@@ -0,0 +1 @@
+ {"n_dirs_local": 5120, "d_model": 1280, "k": 10, "auxk": 256, "dead_steps_threshold": 2441}
checkpoints/unet.up_blocks.0.attentions.0_k10_hidden5120_auxk256_bs4096_lr0.0001/final/mean.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de036d0fb9ee663f7bdf60e4a5d89d038516dae637531676b53ff75d05eab46b
3
+ size 21581
checkpoints/unet.up_blocks.0.attentions.0_k10_hidden5120_auxk256_bs4096_lr0.0001/final/state_dict.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14c45efd9cce0258f014c49babdcd0e9ce8b266fe31eed72db1a45b990a1a0f8
3
+ size 52497831
checkpoints/unet.up_blocks.0.attentions.0_k10_hidden5120_auxk256_bs4096_lr0.0001/final/std.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb9c04499ccae041987cc262894e254c2f04288857a8a0470cfb1b86a8ecfa09
3
+ size 21576
checkpoints/unet.up_blocks.0.attentions.1_k10_hidden5120_auxk256_bs4096_lr0.0001/final/config.json ADDED
@@ -0,0 +1 @@
+ {"n_dirs_local": 5120, "d_model": 1280, "k": 10, "auxk": 256, "dead_steps_threshold": 2441}
checkpoints/unet.up_blocks.0.attentions.1_k10_hidden5120_auxk256_bs4096_lr0.0001/final/mean.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96dbf6fffe9d62c3b3352f8e4fe48c54dfd69906cf8ad6828d5ce93db9a5f0dc
3
+ size 21581
checkpoints/unet.up_blocks.0.attentions.1_k10_hidden5120_auxk256_bs4096_lr0.0001/final/state_dict.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8eed82f4bcb2f010ae9075f10a1ece801ee3dec46dba7fadccc35f6c0a7836b
3
+ size 52497831
checkpoints/unet.up_blocks.0.attentions.1_k10_hidden5120_auxk256_bs4096_lr0.0001/final/std.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe5c5be0c4c2d2b57e7888319053cb64929559f947c8ce445ddd6a397302afab
3
+ size 21576
example.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ diffusers==0.29.2
+ gradio==4.44.1
+ torch>=2.4.0
+ numpy
+ matplotlib
+ pillow
+ wandb
resourses/image.png ADDED

Git LFS Details

  • SHA256: 86594c5876d61a3eac5238b739eeec41418995c7696b6453d70b4e683ebd82df
  • Pointer size: 132 Bytes
  • Size of remote file: 1.12 MB
scripts/collect_latents_dataset.py ADDED
@@ -0,0 +1,96 @@
+ import os
+ import sys
+ import io
+ import tarfile
+ import torch
+ import webdataset as wds
+ import numpy as np
+
+ from tqdm import tqdm
+ sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
+ from SDLens.hooked_sd_pipeline import HookedStableDiffusionXLPipeline
+
+ import datetime
+ from datasets import load_dataset
+ from torch.utils.data import DataLoader
+ import diffusers
+ import fire
+
+ def main(save_path, start_at=0, finish_at=30000, dataset_batch_size=50):
+     blocks_to_save = [
+         'unet.down_blocks.2.attentions.1',
+         'unet.mid_block.attentions.0',
+         'unet.up_blocks.0.attentions.0',
+         'unet.up_blocks.0.attentions.1',
+     ]
+
+     # Initialization
+     dataset = load_dataset("guangyil/laion-coco-aesthetic", split="train", columns=["caption"], streaming=True).shuffle(seed=42)
+     pipe = HookedStableDiffusionXLPipeline.from_pretrained('stabilityai/sdxl-turbo')
+     pipe.to('cuda')
+     pipe.set_progress_bar_config(disable=True)
+     dataloader = DataLoader(dataset, batch_size=dataset_batch_size)
+
+     ct = datetime.datetime.now()
+     save_path = os.path.join(save_path, str(ct))
+     # Collecting dataset
+     os.makedirs(save_path, exist_ok=True)
+
+     writers = {
+         block: wds.TarWriter(f'{save_path}/{block}.tar') for block in blocks_to_save
+     }
+
+     writers.update({'images': wds.TarWriter(f'{save_path}/images.tar')})
+
+     def to_kwargs(kwargs_to_save):
+         kwargs = kwargs_to_save.copy()
+         seed = kwargs['seed']
+         del kwargs['seed']
+         kwargs['generator'] = torch.Generator(device="cpu").manual_seed(seed)
+         return kwargs
+
+     dataloader_iter = iter(dataloader)
+     for num_document, batch in tqdm(enumerate(dataloader)):
+         if num_document < start_at:
+             continue
+
+         if num_document >= finish_at:
+             break
+
+         kwargs_to_save = {
+             'prompt': batch['caption'],
+             'positions_to_cache': blocks_to_save,
+             'save_input': True,
+             'save_output': True,
+             'num_inference_steps': 1,
+             'guidance_scale': 0.0,
+             'seed': num_document,
+             'output_type': 'pil'
+         }
+
+         kwargs = to_kwargs(kwargs_to_save)
+
+         output, cache = pipe.run_with_cache(
+             **kwargs
+         )
+
+         blocks = cache['input'].keys()
+         for block in blocks:
+             sample = {
+                 "__key__": f"sample_{num_document}",
+                 "output.pth": cache['output'][block],
+                 "diff.pth": cache['output'][block] - cache['input'][block],
+                 "gen_args.json": kwargs_to_save
+             }
+
+             writers[block].write(sample)
+         writers['images'].write({
+             "__key__": f"sample_{num_document}",
+             "images.npy": np.stack(output.images)
+         })
+
+     for block, writer in writers.items():
+         writer.close()
+
+ if __name__ == '__main__':
+     fire.Fire(main)
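
The collector is a `fire` CLI (`main(save_path, start_at=0, finish_at=30000, dataset_batch_size=50)`), normally launched as `python scripts/collect_latents_dataset.py --save_path <dir> ...`. A minimal sketch of the equivalent direct call follows; it is not part of the commit, the output directory is a placeholder, and `scripts/` is assumed to be importable from the repository root:

import sys
sys.path.append('scripts')  # assumption: run from the repository root
from collect_latents_dataset import main

# Writes one WebDataset tar per hooked block, plus images.tar, under <save_path>/<timestamp>/
main(save_path='./latents', start_at=0, finish_at=100, dataset_batch_size=50)
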
scripts/train_sae.py ADDED
@@ -0,0 +1,308 @@
+ '''
+ Adapted from
+ https://github.com/openai/sparse_autoencoder/blob/main/sparse_autoencoder/train.py
+ '''
+
+
+ import os
+ import sys
+ sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
+ from typing import Callable, Iterable, Iterator
+
+ import torch
+ import torch.distributed as dist
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torch.distributed import ReduceOp
+ from SAE.dataset_iterator import ActivationsDataloader
+ from SAE.sae import SparseAutoencoder, unit_norm_decoder_, unit_norm_decoder_grad_adjustment_
+ from SAE.sae_utils import SAETrainingConfig, Config
+
+ from types import SimpleNamespace
+ from typing import Optional, List
+ import json
+
+ import tqdm
+
+ def weighted_average(points: torch.Tensor, weights: torch.Tensor):
+     weights = weights / weights.sum()
+     return (points * weights.view(-1, 1)).sum(dim=0)
+
+
+ @torch.no_grad()
+ def geometric_median_objective(
+     median: torch.Tensor, points: torch.Tensor, weights: torch.Tensor
+ ) -> torch.Tensor:
+
+     norms = torch.linalg.norm(points - median.view(1, -1), dim=1)  # type: ignore
+
+     return (norms * weights).sum()
+
+
+ def compute_geometric_median(
+     points: torch.Tensor,
+     weights: Optional[torch.Tensor] = None,
+     eps: float = 1e-6,
+     maxiter: int = 100,
+     ftol: float = 1e-20,
+     do_log: bool = False,
+ ):
+     """
+     :param points: ``torch.Tensor`` of shape ``(n, d)``
+     :param weights: Optional ``torch.Tensor`` of shape :math:``(n,)``.
+     :param eps: Smallest allowed value of denominator, to avoid divide by zero.
+         Equivalently, this is a smoothing parameter. Default 1e-6.
+     :param maxiter: Maximum number of Weiszfeld iterations. Default 100
+     :param ftol: If objective value does not improve by at least this `ftol` fraction, terminate the algorithm. Default 1e-20.
+     :param do_log: If true will return a log of function values encountered through the course of the algorithm
+     :return: SimpleNamespace object with fields
+         - `median`: estimate of the geometric median, which is a ``torch.Tensor`` object of shape :math:``(d,)``
+         - `termination`: string explaining how the algorithm terminated.
+         - `logs`: function values encountered through the course of the algorithm in a list (None if do_log is false).
+     """
+     with torch.no_grad():
+
+         if weights is None:
+             weights = torch.ones((points.shape[0],), device=points.device)
+         # initialize median estimate at mean
+         new_weights = weights
+         median = weighted_average(points, weights)
+         objective_value = geometric_median_objective(median, points, weights)
+         if do_log:
+             logs = [objective_value]
+         else:
+             logs = None
+
+         # Weiszfeld iterations
+         early_termination = False
+         pbar = tqdm.tqdm(range(maxiter))
+         for _ in pbar:
+             prev_obj_value = objective_value
+
+             norms = torch.linalg.norm(points - median.view(1, -1), dim=1)  # type: ignore
+             new_weights = weights / torch.clamp(norms, min=eps)
+             median = weighted_average(points, new_weights)
+             objective_value = geometric_median_objective(median, points, weights)
+
+             if logs is not None:
+                 logs.append(objective_value)
+             if abs(prev_obj_value - objective_value) <= ftol * objective_value:
+                 early_termination = True
+                 break
+
+             pbar.set_description(f"Objective value: {objective_value:.4f}")
+
+     median = weighted_average(points, new_weights)  # allow autodiff to track it
+     return SimpleNamespace(
+         median=median,
+         new_weights=new_weights,
+         termination=(
+             "function value converged within tolerance"
+             if early_termination
+             else "maximum iterations reached"
+         ),
+         logs=logs,
+     )
+
+ def maybe_transpose(x):
+     return x.T if not x.is_contiguous() and x.T.is_contiguous() else x
+
+ import wandb
+
+ RANK = 0
+
+ class Logger:
+     def __init__(self, sae_name, **kws):
+         self.vals = {}
+         self.enabled = (RANK == 0) and not kws.pop("dummy", False)
+         self.sae_name = sae_name
+
+     def logkv(self, k, v):
+         if self.enabled:
+             self.vals[f'{self.sae_name}/{k}'] = v.detach() if isinstance(v, torch.Tensor) else v
+         return v
+
+     def dumpkvs(self, step):
+         if self.enabled:
+             wandb.log(self.vals, step=step)
+             self.vals = {}
+
+
+ class FeaturesStats:
+     def __init__(self, dim, logger):
+         self.dim = dim
+         self.logger = logger
+         self.reinit()
+
+     def reinit(self):
+         self.n_activated = torch.zeros(self.dim, dtype=torch.long, device="cuda")
+         self.n = 0
+
+     def update(self, inds):
+         self.n += inds.shape[0]
+         inds = inds.flatten().detach()
+         self.n_activated.scatter_add_(0, inds, torch.ones_like(inds))
+
+     def log(self):
+         self.logger.logkv('activated', (self.n_activated / self.n + 1e-9).log10().cpu().numpy())
+
+ def training_loop_(
+     aes,
+     train_acts_iter,
+     loss_fn,
+     log_interval,
+     save_interval,
+     loggers,
+     sae_cfgs,
+ ):
+     sae_packs = []
+     for ae, cfg, logger in zip(aes, sae_cfgs, loggers):
+         pbar = tqdm.tqdm(unit=" steps", desc="Training Loss: ")
+         fstats = FeaturesStats(ae.n_dirs, logger)
+         opt = torch.optim.Adam(ae.parameters(), lr=cfg.lr, eps=cfg.eps, fused=True)
+         sae_packs.append((ae, cfg, logger, pbar, fstats, opt))
+
+     for i, flat_acts_train_batch in enumerate(train_acts_iter):
+         flat_acts_train_batch = flat_acts_train_batch.cuda()
+
+         for ae, cfg, logger, pbar, fstats, opt in sae_packs:
+             recons, info = ae(flat_acts_train_batch)
+             loss = loss_fn(ae, cfg, flat_acts_train_batch, recons, info, logger)
+
+             fstats.update(info['inds'])
+
+             bs = flat_acts_train_batch.shape[0]
+             logger.logkv('not-activated 1e4', (ae.stats_last_nonzero > 1e4 / bs).mean(dtype=float).item())
+             logger.logkv('not-activated 1e6', (ae.stats_last_nonzero > 1e6 / bs).mean(dtype=float).item())
+             logger.logkv('not-activated 1e7', (ae.stats_last_nonzero > 1e7 / bs).mean(dtype=float).item())
+
+             logger.logkv('explained variance', explained_variance(recons, flat_acts_train_batch))
+             logger.logkv('l2_div', (torch.linalg.norm(recons, dim=1) / torch.linalg.norm(flat_acts_train_batch, dim=1)).mean())
+
+             if (i + 1) % log_interval == 0:
+                 fstats.log()
+                 fstats.reinit()
+
+             if (i + 1) % save_interval == 0:
+                 ae.save_to_disk(f"{cfg.save_path}/{i + 1}")
+
+             loss.backward()
+
+             unit_norm_decoder_(ae)
+             unit_norm_decoder_grad_adjustment_(ae)
+
+             opt.step()
+             opt.zero_grad()
+             logger.dumpkvs(i)
+
+             pbar.set_description(f"Training Loss {loss.item():.4f}")
+             pbar.update(1)
+
+
+     for ae, cfg, logger, pbar, fstats, opt in sae_packs:
+         pbar.close()
+         ae.save_to_disk(f"{cfg.save_path}/final")
+
+
+ def init_from_data_(ae, stats_acts_sample):
+     ae.pre_bias.data = (
+         compute_geometric_median(stats_acts_sample[:32768].float().cpu()).median.cuda().float()
+     )
+
+
+ def mse(recons, x):
+     # return ((recons - x) ** 2).sum(dim=-1).mean()
+     return ((recons - x) ** 2).mean()
+
+ def normalized_mse(recon: torch.Tensor, xs: torch.Tensor) -> torch.Tensor:
+     # only used for auxk
+     xs_mu = xs.mean(dim=0)
+
+     loss = mse(recon, xs) / mse(
+         xs_mu[None, :].broadcast_to(xs.shape), xs
+     )
+
+     return loss
+
+ def explained_variance(recons, x):
+     # Compute the variance of the difference
+     diff = x - recons
+     diff_var = torch.var(diff, dim=0, unbiased=False)
+
+     # Compute the variance of the original tensor
+     x_var = torch.var(x, dim=0, unbiased=False)
+
+     # Avoid division by zero
+     explained_var = 1 - diff_var / (x_var + 1e-8)
+
+     return explained_var.mean()
+
+
+ def main():
+     cfg = Config(json.load(open('SAE/config.json')))
+
+     dataloader = ActivationsDataloader(cfg.paths_to_latents, cfg.block_name, cfg.bs)
+
+     acts_iter = dataloader.iterate()
+     stats_acts_sample = torch.cat([
+         next(acts_iter).cpu() for _ in range(10)
+     ], dim=0)
+
+     aes = [
+         SparseAutoencoder(
+             n_dirs_local=sae.n_dirs,
+             d_model=sae.d_model,
+             k=sae.k,
+             auxk=sae.auxk,
+             dead_steps_threshold=sae.dead_toks_threshold // cfg.bs,
+         ).cuda()
+         for sae in cfg.saes
+     ]
+
+     for ae in aes:
+         init_from_data_(ae, stats_acts_sample)
+
+     mse_scale = (
+         1 / ((stats_acts_sample.float().mean(dim=0) - stats_acts_sample.float()) ** 2).mean()
+     )
+     mse_scale = mse_scale.item()
+     del stats_acts_sample
+
+     wandb.init(
+         project=cfg.wandb_project,
+         name=cfg.wandb_name,
+     )
+
+     loggers = [Logger(
+         sae_name=cfg_sae.sae_name,
+         dummy=False,
+     ) for cfg_sae in cfg.saes]
+
+     training_loop_(
+         aes,
+         acts_iter,
+         lambda ae, cfg_sae, flat_acts_train_batch, recons, info, logger: (
+             # MSE
+             logger.logkv("train_recons", mse_scale * mse(recons, flat_acts_train_batch))
+             # AuxK
+             + logger.logkv(
+                 "train_maxk_recons",
+                 cfg_sae.auxk_coef
+                 * normalized_mse(
+                     ae.decode_sparse(
+                         info["auxk_inds"],
+                         info["auxk_vals"],
+                     ),
+                     flat_acts_train_batch - recons.detach() + ae.pre_bias.detach(),
+                 ).nan_to_num(0),
+             )
+         ),
+         sae_cfgs=cfg.saes,
+         loggers=loggers,
+         log_interval=cfg.log_interval,
+         save_interval=cfg.save_interval,
+     )
+
+
+ if __name__ == "__main__":
+     main()
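
For orientation, `init_from_data_` above seeds each SAE's `pre_bias` with the geometric median of a sample of activations. A minimal sketch of calling the Weiszfeld routine on stand-in data follows; it is not part of the commit, shapes are illustrative (with `d_model=1280` matching the checkpoint configs), and `scripts/` is assumed to be importable:

import sys, torch
sys.path.append('scripts')  # assumption: run from the repository root
from train_sae import compute_geometric_median

acts = torch.randn(4096, 1280)             # stand-in for a sample of block activations
res = compute_geometric_median(acts, maxiter=100, do_log=True)
print(res.termination)                     # how the iteration stopped
pre_bias_init = res.median                 # the vector init_from_data_ copies into ae.pre_bias
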
utils/__init__.py ADDED
@@ -0,0 +1 @@
+ from .hooks import *
utils/hooks.py ADDED
@@ -0,0 +1,45 @@
+ import torch
+
+ @torch.no_grad()
+ def add_feature(sae, feature_idx, value, module, input, output):
+     diff = (output[0] - input[0]).permute((0, 2, 3, 1)).to(sae.device)
+     activated = sae.encode(diff)
+     mask = torch.zeros_like(activated, device=diff.device)
+     mask[..., feature_idx] = value
+     to_add = mask @ sae.decoder.weight.T
+     return (output[0] + to_add.permute(0, 3, 1, 2).to(output[0].device),)
+
+
+ @torch.no_grad()
+ def add_feature_on_area(sae, feature_idx, activation_map, module, input, output):
+     diff = (output[0] - input[0]).permute((0, 2, 3, 1)).to(sae.device)
+     activated = sae.encode(diff)
+     mask = torch.zeros_like(activated, device=diff.device)
+     if len(activation_map.shape) == 2:
+         activation_map = activation_map.unsqueeze(0)
+     mask[..., feature_idx] = activation_map.to(mask.device)
+     to_add = mask @ sae.decoder.weight.T
+     return (output[0] + to_add.permute(0, 3, 1, 2).to(output[0].device),)
+
+
+ @torch.no_grad()
+ def replace_with_feature(sae, feature_idx, value, module, input, output):
+     diff = (output[0] - input[0]).permute((0, 2, 3, 1)).to(sae.device)
+     activated = sae.encode(diff)
+     mask = torch.zeros_like(activated, device=diff.device)
+     mask[..., feature_idx] = value
+     to_add = mask @ sae.decoder.weight.T
+     return (input[0] + to_add.permute(0, 3, 1, 2).to(output[0].device),)
+
+
+ @torch.no_grad()
+ def reconstruct_sae_hook(sae, module, input, output):
+     diff = (output[0] - input[0]).permute((0, 2, 3, 1)).to(sae.device)
+     activated = sae.encode(diff)
+     reconstructed = sae.decoder(activated) + sae.pre_bias
+     return (input[0] + reconstructed.permute(0, 3, 1, 2).to(output[0].device),)
+
+
+ @torch.no_grad()
+ def ablate_block(module, input, output):
+     return input
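
Each hook above takes `(sae, ...intervention args..., module, input, output)`, so binding the leading arguments with `functools.partial` leaves the standard `(module, input, output)` forward-hook signature. A minimal sketch of wiring one up manually follows; it is not part of the commit, assumes an already-loaded `pipe` (HookedStableDiffusionXLPipeline) and a trained `sae` for `unet.down_blocks.2.attentions.1`, and uses an arbitrary feature index and strength:

from functools import partial
from utils.hooks import add_feature

hook_fn = partial(add_feature, sae, 3301, 15.0)   # bind (sae, feature_idx, value) positionally
block = pipe.unet.down_blocks[2].attentions[1]    # the block this SAE was trained on
handle = block.register_forward_hook(hook_fn)     # the returned tuple replaces the block output
try:
    image = pipe('a photo of a cat', num_inference_steps=1, guidance_scale=0.0).images[0]
finally:
    handle.remove()                               # detach the hook when done
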