"""
Here we reproduce DAAM (Diffusion Attentive Attribution Maps), but for Flux DiT models. This is
effectively a visualization of the cross-attention layers of a Flux model.
"""
from torch import nn
import torch
import einops
from concept_attention.image_generator import FluxGenerator
from concept_attention.segmentation import SegmentationAbstractClass


class DAAM(nn.Module):

    def __init__(
        self,
        model_name: str = "flux-schnell",
        device: str = "cuda",
        offload: bool = True,
    ):
        """
        Initialize the DAAM model.
        """
        super(DAAM, self).__init__()
        # Load up the flux generator
        self.generator = FluxGenerator(
            model_name=model_name,
            device=device,
            offload=offload,
        )
        # Unpack the tokenizer
        self.tokenizer = self.generator.t5.tokenizer

    def __call__(
        self,
        prompt,
        seed=4,
        num_steps=4,
        timesteps=None,
        layers=None,
    ):
        """
        Generate cross-attention heatmap visualizations.
        Args:
        - prompt: str, the prompt to generate the visualizations for
        - seed: int, the random seed to use for image generation
        - num_steps: int, the number of denoising steps to run
        - timesteps: list[int] or None, the timesteps to average over (defaults to all steps)
        - layers: list[int] or None, the double-block layers to average over (defaults to all 19)
        Returns:
        - attention_maps: torch.Tensor, the attention maps for each token in the prompt
        - tokens: list[str], the tokens in the prompt
        - image: torch.Tensor, the image generated by the model
        """
        if timesteps is None:
            timesteps = list(range(num_steps))
        if layers is None:
            layers = list(range(19))
        # Run the tokenizer and get the list of token strings
        token_strings = self.tokenizer.tokenize(prompt)
        # Run the image generator
        image = self.generator.generate_image(
            width=1024,
            height=1024,
            num_steps=num_steps,
            guidance=0.0,
            seed=seed,
            prompt=prompt,
            concepts=token_strings
        )
        # Pull out and average the attention maps
        cross_attention_maps = []
        for double_block in self.generator.model.double_blocks:
            cross_attention_map = torch.stack(
                double_block.cross_attention_maps
            ).squeeze(1)
            # Clear the cached vectors for this layer
            double_block.clear_cached_vectors()
            # Append to the list
            cross_attention_maps.append(cross_attention_map)
        # Stack layers: shape (layers, time, concepts, height, width)
        cross_attention_maps = torch.stack(cross_attention_maps).to(torch.float32)
        # Pull out the desired timesteps
        cross_attention_maps = cross_attention_maps[:, timesteps]
        # Pull out the desired layers
        cross_attention_maps = cross_attention_maps[layers]
        # Average over layers and time
        attention_maps = einops.reduce(
            cross_attention_maps,
            "layers time concepts height width -> concepts height width",
            reduction="mean"
        )
        # Keep only the attention maps that correspond to the prompt tokens
        attention_maps = attention_maps[:len(token_strings)]
        return attention_maps, token_strings, image
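

# Example usage sketch: assumes a CUDA device, the concept_attention package, and that
# matplotlib is installed; the prompt, seed, and output paths below are illustrative
# values, not part of the class above.
if __name__ == "__main__":
    import matplotlib.pyplot as plt

    daam = DAAM(model_name="flux-schnell", device="cuda", offload=True)
    attention_maps, tokens, image = daam(
        "a photo of a dog playing in a park",
        seed=4,
        num_steps=4,
    )
    # attention_maps has shape (num_tokens, height, width); save one heatmap per token.
    for index, (token, heatmap) in enumerate(zip(tokens, attention_maps)):
        plt.imshow(heatmap.cpu().numpy(), cmap="viridis")
        plt.title(token)
        plt.axis("off")
        plt.savefig(f"heatmap_{index}.png", bbox_inches="tight")
        plt.close()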