Spaces:

jamesliu1217
/

EasyControl

Running on Zero

App Files Files Community

jamesliu1217 commited on Mar 18

Commit

8f6d6cb

verified ·

1 Parent(s): 5544c41

Upload 7 files

Browse files

Files changed (7) hide show

app.py +183 -4
src/__init__.py +0 -0
src/layers_cache.py +368 -0
src/lora_helper.py +196 -0
src/pipeline.py +722 -0
src/prompt_helper.py +205 -0
src/transformer_flux.py +583 -0

app.py CHANGED Viewed

@@ -1,7 +1,186 @@
 import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()

+import os
+import json
+import time
+import torch
+from PIL import Image
+from tqdm import tqdm
 import gradio as gr
+from safetensors.torch import save_file
+from src.pipeline import FluxPipeline
+from src.transformer_flux import FluxTransformer2DModel
+from src.lora_helper import set_single_lora, set_multi_lora, unset_lora
+class ImageProcessor:
+    def __init__(self, path):
+        device = "cuda"
+        self.pipe = FluxPipeline.from_pretrained(path, torch_dtype=torch.bfloat16, device=device)
+        transformer = FluxTransformer2DModel.from_pretrained(path, subfolder="transformer", torch_dtype=torch.bfloat16, device=device)
+        self.pipe.transformer = transformer
+        self.pipe.to(device)
+    def clear_cache(self, transformer):
+        for name, attn_processor in transformer.attn_processors.items():
+            attn_processor.bank_kv.clear()
+    def process_image(self, prompt='', subject_imgs=[], spatial_imgs=[], height=768, width=768, output_path=None, seed=42):
+        image = self.pipe(
+            prompt,
+            height=int(height),
+            width=int(width),
+            guidance_scale=3.5,
+            num_inference_steps=25,
+            max_sequence_length=512,
+            generator=torch.Generator("cpu").manual_seed(seed),
+            subject_images=subject_imgs,
+            spatial_images=spatial_imgs,
+            cond_size=512,
+        ).images[0]
+        self.clear_cache(self.pipe.transformer)
+        if output_path:
+            image.save(output_path)
+        return image
+# Initialize the image processor
+base_path = "/opt/liblibai-models/model-weights/black-forest-labs/FLUX.1-dev"
+lora_base_path = "/opt/liblibai-models/user-workspace/rey/projects/opensource/github/EasyControl/models"
+style_lora_base_path = "/opt/liblibai-models/user-workspace/zhangyuxuan/models/Shakker-Labs"
+processor = ImageProcessor(base_path)
+# Define the Gradio interface
+def single_condition_generate_image(prompt, subject_img, spatial_img, height, width, seed, control_type, style_lora=None):
+    # Set the control type
+    if control_type == "subject":
+        lora_path = os.path.join(lora_base_path, "subject.safetensors")
+    elif control_type == "depth":
+        lora_path = os.path.join(lora_base_path, "depth.safetensors")
+    elif control_type == "seg":
+        lora_path = os.path.join(lora_base_path, "seg.safetensors")
+    elif control_type == "pose":
+        lora_path = os.path.join(lora_base_path, "pose.safetensors")
+    elif control_type == "inpainting":
+        lora_path = os.path.join(lora_base_path, "inpainting.safetensors")
+    elif control_type == "hedsketch":
+        lora_path = os.path.join(lora_base_path, "hedsketch.safetensors")
+    elif control_type == "canny":
+        lora_path = os.path.join(lora_base_path, "canny.safetensors")
+    set_single_lora(processor.pipe.transformer, lora_path, lora_weights=[1], cond_size=512)
+    # Set the style LoRA
+    if style_lora=="None":
+        pass
+    else:
+        if style_lora == "Simple_Sketch":
+            processor.pipe.unload_lora_weights()
+            style_lora_path = os.path.join(style_lora_base_path, "FLUX.1-dev-LoRA-Children-Simple-Sketch/pytorch_lora_weights.safetensors")
+            processor.pipe.load_lora_weights(self.lora_path, weight_name="pytorch_lora_weights.safetensors")
+        if style_lora == "Text_Poster":
+            processor.pipe.unload_lora_weights()
+            style_lora_path = os.path.join(style_lora_base_path, "FLUX.1-dev-LoRA-Text-Poster/pytorch_lora_weights.safetensors")
+            processor.pipe.load_lora_weights(self.lora_path, weight_name="pytorch_lora_weights.safetensors")
+        if style_lora == "Vector_Style":
+            processor.pipe.unload_lora_weights()
+            style_lora_path = os.path.join(style_lora_base_path, "FLUX.1-dev-LoRA-Vector-Journey/pytorch_lora_weights.safetensors")
+            processor.pipe.load_lora_weights(style_lora_path, weight_name="pytorch_lora_weights.safetensors")
+    # Process the image
+    subject_imgs = [subject_img] if subject_img else []
+    spatial_imgs = [spatial_img] if spatial_img else []
+    image = processor.process_image(prompt=prompt, subject_imgs=subject_imgs, spatial_imgs=spatial_imgs, height=height, width=width, seed=seed)
+    return image
+# Define the Gradio interface
+def multi_condition_generate_image(prompt, subject_img, spatial_img, height, width, seed):
+    subject_path = os.path.join(lora_base_path, "subject.safetensors")
+    inpainting_path = os.path.join(lora_base_path, "inpainting.safetensors")
+    set_multi_lora(processor.pipe.transformer, [subject_path, inpainting_path], lora_weights=[[1],[1]],cond_size=512)
+    # Process the image
+    subject_imgs = [subject_img] if subject_img else []
+    spatial_imgs = [spatial_img] if spatial_img else []
+    image = processor.process_image(prompt=prompt, subject_imgs=subject_imgs, spatial_imgs=spatial_imgs, height=height, width=width, seed=seed)
+    return image
+# Define the Gradio interface components
+control_types = ["subject", "depth", "pose", "inpainting", "hedsketch", "seg", "canny"]
+style_loras = ["Simple_Sketch", "Text_Poster", "Vector_Style", "None"]
+# Example data
+single_examples = [
+    ["A SKS in the library", Image.open("/opt/liblibai-models/user-workspace/zhangyuxuan/project/easycontrol/inference0310/test_imgs/subject3.jpg"), None, 1024, 1024, 5, "subject", None],
+    ["In a picturesque village, a narrow cobblestone street with rustic stone buildings, colorful blinds, and lush green spaces, a cartoon man drawn with simple lines and solid colors stands in the foreground, wearing a red shirt, beige work pants, and brown shoes, carrying a strap on his shoulder. The scene features warm and enticing colors, a pleasant fusion of nature and architecture, and the camera's perspective on the street clearly shows the charming and quaint environment., Integrating elements of reality and cartoon.", None, Image.open("/opt/liblibai-models/user-workspace/zhangyuxuan/project/easycontrol/inference0310/test_imgs/openpose.png"), 1024, 1024, 1, "pose", "Vector_Style"],
+]
+multi_examples = [
+    ["A SKS on the car", Image.open("/opt/liblibai-models/user-workspace/zhangyuxuan/project/easycontrol/code0221/test_imgs/subject/s17.png"), Image.open("/opt/liblibai-models/user-workspace/zhangyuxuan/project/easycontrol/code0221/test_imgs/inpainting.png"), 1024, 1024, 7],
+]
+# Create the Gradio Blocks interface
+with gr.Blocks() as demo:
+    gr.Markdown("# Image Generation with EasyControl")
+    gr.Markdown("Generate images using EasyControl with different control types and style LoRAs.")
+    with gr.Tab("Single Condition Generation"):
+        with gr.Row():
+            with gr.Column():
+                prompt = gr.Textbox(label="Prompt")
+                subject_img = gr.Image(label="Subject Image", type="pil")  # 上传图像文件
+                spatial_img = gr.Image(label="Spatial Image", type="pil")  # 上传图像文件
+                height = gr.Slider(minimum=256, maximum=1536, step=64, label="Height", value=768)
+                width = gr.Slider(minimum=256, maximum=1536, step=64, label="Width", value=768)
+                seed = gr.Number(label="Seed", value=42)
+                control_type = gr.Dropdown(choices=control_types, label="Control Type")
+                style_lora = gr.Dropdown(choices=style_loras, label="Style LoRA")
+                single_generate_btn = gr.Button("Generate Image")
+            with gr.Column():
+                single_output_image = gr.Image(label="Generated Image")
+        # Add examples for Single Condition Generation
+        gr.Examples(
+            examples=single_examples,
+            inputs=[prompt, subject_img, spatial_img, height, width, seed, control_type, style_lora],
+            outputs=single_output_image,
+            fn=single_condition_generate_image,
+            cache_examples=True,  # 缓存示例结果以加快加载速度
+            label="Single Condition Examples"
+        )
+    with gr.Tab("Multi-Condition Generation"):
+        with gr.Row():
+            with gr.Column():
+                multi_prompt = gr.Textbox(label="Prompt")
+                multi_subject_img = gr.Image(label="Subject Image", type="pil")  # 上传图像文件
+                multi_spatial_img = gr.Image(label="Spatial Image", type="pil")  # 上传图像文件
+                multi_height = gr.Slider(minimum=256, maximum=1536, step=64, label="Height", value=768)
+                multi_width = gr.Slider(minimum=256, maximum=1536, step=64, label="Width", value=768)
+                multi_seed = gr.Number(label="Seed", value=42)
+                multi_generate_btn = gr.Button("Generate Image")
+            with gr.Column():
+                multi_output_image = gr.Image(label="Generated Image")
+        # Add examples for Multi-Condition Generation
+        gr.Examples(
+            examples=multi_examples,
+            inputs=[multi_prompt, multi_subject_img, multi_spatial_img, multi_height, multi_width, multi_seed],
+            outputs=multi_output_image,
+            fn=multi_condition_generate_image,
+            cache_examples=True,  # 缓存示例结果以加快加载速度
+            label="Multi-Condition Examples"
+        )
+    # Link the buttons to the functions
+    single_generate_btn.click(
+        single_condition_generate_image,
+        inputs=[prompt, subject_img, spatial_img, height, width, seed, control_type, style_lora],
+        outputs=single_output_image
+    )
+    multi_generate_btn.click(
+        multi_condition_generate_image,
+        inputs=[multi_prompt, multi_subject_img, multi_spatial_img, multi_height, multi_width, multi_seed],
+        outputs=multi_output_image
+    )
+# Launch the Gradio app
+demo.queue().launch(server_name='0.0.0.0',server_port=7861)

src/__init__.py ADDED Viewed

File without changes

src/layers_cache.py ADDED Viewed

	@@ -0,0 +1,368 @@

+import inspect
+import math
+from typing import Callable, List, Optional, Tuple, Union
+from einops import rearrange
+import torch
+from torch import nn
+import torch.nn.functional as F
+from torch import Tensor
+from diffusers.models.attention_processor import Attention
+class LoRALinearLayer(nn.Module):
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        rank: int = 4,
+        network_alpha: Optional[float] = None,
+        device: Optional[Union[torch.device, str]] = None,
+        dtype: Optional[torch.dtype] = None,
+        cond_width=512,
+        cond_height=512,
+        number=0,
+        n_loras=1
+    ):
+        super().__init__()
+        self.down = nn.Linear(in_features, rank, bias=False, device=device, dtype=dtype)
+        self.up = nn.Linear(rank, out_features, bias=False, device=device, dtype=dtype)
+        # This value has the same meaning as the `--network_alpha` option in the kohya-ss trainer script.
+        # See https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning
+        self.network_alpha = network_alpha
+        self.rank = rank
+        self.out_features = out_features
+        self.in_features = in_features
+        nn.init.normal_(self.down.weight, std=1 / rank)
+        nn.init.zeros_(self.up.weight)
+        self.cond_height = cond_height
+        self.cond_width = cond_width
+        self.number = number
+        self.n_loras = n_loras
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        orig_dtype = hidden_states.dtype
+        dtype = self.down.weight.dtype
+        ####
+        batch_size = hidden_states.shape[0]
+        cond_size = self.cond_width // 8 * self.cond_height // 8 * 16 // 64
+        block_size =  hidden_states.shape[1] - cond_size * self.n_loras
+        shape = (batch_size, hidden_states.shape[1], 3072)
+        mask = torch.ones(shape, device=hidden_states.device, dtype=dtype)
+        mask[:, :block_size+self.number*cond_size, :] = 0
+        mask[:, block_size+(self.number+1)*cond_size:, :] = 0
+        hidden_states = mask * hidden_states
+        ####
+        down_hidden_states = self.down(hidden_states.to(dtype))
+        up_hidden_states = self.up(down_hidden_states)
+        if self.network_alpha is not None:
+            up_hidden_states *= self.network_alpha / self.rank
+        return up_hidden_states.to(orig_dtype)
+class MultiSingleStreamBlockLoraProcessor(nn.Module):
+    def __init__(self, dim: int, ranks=[], lora_weights=[], network_alphas=[], device=None, dtype=None, cond_width=512, cond_height=512, n_loras=1):
+        super().__init__()
+        # Initialize a list to store the LoRA layers
+        self.n_loras = n_loras
+        self.cond_width = cond_width
+        self.cond_height = cond_height
+        self.q_loras = nn.ModuleList([
+            LoRALinearLayer(dim, dim, ranks[i],network_alphas[i], device=device, dtype=dtype, cond_width=cond_width, cond_height=cond_height, number=i, n_loras=n_loras)
+            for i in range(n_loras)
+        ])
+        self.k_loras = nn.ModuleList([
+            LoRALinearLayer(dim, dim, ranks[i],network_alphas[i], device=device, dtype=dtype, cond_width=cond_width, cond_height=cond_height, number=i, n_loras=n_loras)
+            for i in range(n_loras)
+        ])
+        self.v_loras = nn.ModuleList([
+            LoRALinearLayer(dim, dim, ranks[i],network_alphas[i], device=device, dtype=dtype, cond_width=cond_width, cond_height=cond_height, number=i, n_loras=n_loras)
+            for i in range(n_loras)
+        ])
+        self.lora_weights = lora_weights
+        self.bank_attn = None
+        self.bank_kv = []
+    def __call__(self,
+        attn: Attention,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: torch.FloatTensor = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        image_rotary_emb: Optional[torch.Tensor] = None,
+        use_cond = False
+    ) -> torch.FloatTensor:
+        batch_size, seq_len, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        scaled_seq_len = hidden_states.shape[1]
+        cond_size = self.cond_width // 8 * self.cond_height // 8 * 16 // 64
+        block_size =  scaled_seq_len - cond_size * self.n_loras
+        scaled_cond_size = cond_size
+        scaled_block_size = block_size
+        if len(self.bank_kv)== 0:
+            cache = True
+        else:
+            cache = False
+        if cache:
+            query = attn.to_q(hidden_states)
+            key = attn.to_k(hidden_states)
+            value = attn.to_v(hidden_states)
+            for i in range(self.n_loras):
+                query = query + self.lora_weights[i] * self.q_loras[i](hidden_states)
+                key = key + self.lora_weights[i] * self.k_loras[i](hidden_states)
+                value = value + self.lora_weights[i] * self.v_loras[i](hidden_states)
+            inner_dim = key.shape[-1]
+            head_dim = inner_dim // attn.heads
+            query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+            key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+            value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+            self.bank_kv.append(key[:, :, scaled_block_size:, :])
+            self.bank_kv.append(value[:, :, scaled_block_size:, :])
+            if attn.norm_q is not None:
+                query = attn.norm_q(query)
+            if attn.norm_k is not None:
+                key = attn.norm_k(key)
+            if image_rotary_emb is not None:
+                from diffusers.models.embeddings import apply_rotary_emb
+                query = apply_rotary_emb(query, image_rotary_emb)
+                key = apply_rotary_emb(key, image_rotary_emb)
+            num_cond_blocks = self.n_loras
+            mask = torch.ones((scaled_seq_len, scaled_seq_len), device=hidden_states.device)
+            mask[ :scaled_block_size, :] = 0  # First block_size row
+            for i in range(num_cond_blocks):
+                start = i * scaled_cond_size + scaled_block_size
+                end = (i + 1) * scaled_cond_size + scaled_block_size
+                mask[start:end, start:end] = 0  # Diagonal blocks
+            mask = mask * -1e20
+            mask = mask.to(query.dtype)
+            hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False, attn_mask=mask)
+            self.bank_attn = hidden_states[:, :, scaled_block_size:, :]
+        else:
+            query = attn.to_q(hidden_states)
+            key = attn.to_k(hidden_states)
+            value = attn.to_v(hidden_states)
+            inner_dim = query.shape[-1]
+            head_dim = inner_dim // attn.heads
+            query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+            key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+            value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+            key = torch.concat([key[:, :, :scaled_block_size, :], self.bank_kv[0]], dim=-2)
+            value = torch.concat([value[:, :, :scaled_block_size, :], self.bank_kv[1]], dim=-2)
+            if attn.norm_q is not None:
+                query = attn.norm_q(query)
+            if attn.norm_k is not None:
+                key = attn.norm_k(key)
+            if image_rotary_emb is not None:
+                from diffusers.models.embeddings import apply_rotary_emb
+                query = apply_rotary_emb(query, image_rotary_emb)
+                key = apply_rotary_emb(key, image_rotary_emb)
+            query = query[:, :, :scaled_block_size, :]
+            hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False, attn_mask=None)
+            hidden_states = torch.concat([hidden_states, self.bank_attn], dim=-2)
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+        cond_hidden_states = hidden_states[:, block_size:,:]
+        hidden_states = hidden_states[:, : block_size,:]
+        return hidden_states if not use_cond else (hidden_states, cond_hidden_states)
+class MultiDoubleStreamBlockLoraProcessor(nn.Module):
+    def __init__(self, dim: int, ranks=[], lora_weights=[], network_alphas=[], device=None, dtype=None, cond_width=512, cond_height=512, n_loras=1):
+        super().__init__()
+        # Initialize a list to store the LoRA layers
+        self.n_loras = n_loras
+        self.cond_width = cond_width
+        self.cond_height = cond_height
+        self.q_loras = nn.ModuleList([
+            LoRALinearLayer(dim, dim, ranks[i],network_alphas[i], device=device, dtype=dtype, cond_width=cond_width, cond_height=cond_height, number=i, n_loras=n_loras)
+            for i in range(n_loras)
+        ])
+        self.k_loras = nn.ModuleList([
+            LoRALinearLayer(dim, dim, ranks[i],network_alphas[i], device=device, dtype=dtype, cond_width=cond_width, cond_height=cond_height, number=i, n_loras=n_loras)
+            for i in range(n_loras)
+        ])
+        self.v_loras = nn.ModuleList([
+            LoRALinearLayer(dim, dim, ranks[i],network_alphas[i], device=device, dtype=dtype, cond_width=cond_width, cond_height=cond_height, number=i, n_loras=n_loras)
+            for i in range(n_loras)
+        ])
+        self.proj_loras = nn.ModuleList([
+            LoRALinearLayer(dim, dim, ranks[i],network_alphas[i], device=device, dtype=dtype, cond_width=cond_width, cond_height=cond_height, number=i, n_loras=n_loras)
+            for i in range(n_loras)
+        ])
+        self.lora_weights = lora_weights
+        self.bank_attn = None
+        self.bank_kv = []
+    def __call__(self,
+        attn: Attention,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: torch.FloatTensor = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        image_rotary_emb: Optional[torch.Tensor] = None,
+        use_cond=False,
+    ) -> torch.FloatTensor:
+        batch_size, _, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        cond_size = self.cond_width // 8 * self.cond_height // 8 * 16 // 64
+        block_size =  hidden_states.shape[1] - cond_size * self.n_loras
+        scaled_seq_len = encoder_hidden_states.shape[1] + hidden_states.shape[1]
+        scaled_cond_size = cond_size
+        scaled_block_size = scaled_seq_len - scaled_cond_size * self.n_loras
+        # `context` projections.
+        inner_dim = 3072
+        head_dim = inner_dim // attn.heads
+        encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
+        encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
+        encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
+        encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
+            batch_size, -1, attn.heads, head_dim
+        ).transpose(1, 2)
+        encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(
+            batch_size, -1, attn.heads, head_dim
+        ).transpose(1, 2)
+        encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(
+            batch_size, -1, attn.heads, head_dim
+        ).transpose(1, 2)
+        if attn.norm_added_q is not None:
+            encoder_hidden_states_query_proj = attn.norm_added_q(encoder_hidden_states_query_proj)
+        if attn.norm_added_k is not None:
+            encoder_hidden_states_key_proj = attn.norm_added_k(encoder_hidden_states_key_proj)
+        if len(self.bank_kv)== 0:
+            cache = True
+        else:
+            cache = False
+        if cache:
+            query = attn.to_q(hidden_states)
+            key = attn.to_k(hidden_states)
+            value = attn.to_v(hidden_states)
+            for i in range(self.n_loras):
+                query = query + self.lora_weights[i] * self.q_loras[i](hidden_states)
+                key = key + self.lora_weights[i] * self.k_loras[i](hidden_states)
+                value = value + self.lora_weights[i] * self.v_loras[i](hidden_states)
+            inner_dim = key.shape[-1]
+            head_dim = inner_dim // attn.heads
+            query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+            key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+            value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+            self.bank_kv.append(key[:, :, block_size:, :])
+            self.bank_kv.append(value[:, :, block_size:, :])
+            if attn.norm_q is not None:
+                query = attn.norm_q(query)
+            if attn.norm_k is not None:
+                key = attn.norm_k(key)
+            # attention
+            query = torch.cat([encoder_hidden_states_query_proj, query], dim=2)
+            key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
+            value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)
+            if image_rotary_emb is not None:
+                from diffusers.models.embeddings import apply_rotary_emb
+                query = apply_rotary_emb(query, image_rotary_emb)
+                key = apply_rotary_emb(key, image_rotary_emb)
+            num_cond_blocks = self.n_loras
+            mask = torch.ones((scaled_seq_len, scaled_seq_len), device=hidden_states.device)
+            mask[ :scaled_block_size, :] = 0  # First block_size row
+            for i in range(num_cond_blocks):
+                start = i * scaled_cond_size + scaled_block_size
+                end = (i + 1) * scaled_cond_size + scaled_block_size
+                mask[start:end, start:end] = 0  # Diagonal blocks
+            mask = mask * -1e20
+            mask = mask.to(query.dtype)
+            hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False, attn_mask=mask)
+            self.bank_attn = hidden_states[:, :, scaled_block_size:, :]
+        else:
+            query = attn.to_q(hidden_states)
+            key = attn.to_k(hidden_states)
+            value = attn.to_v(hidden_states)
+            inner_dim = query.shape[-1]
+            head_dim = inner_dim // attn.heads
+            query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+            key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+            value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+            key = torch.concat([key[:, :, :block_size, :], self.bank_kv[0]], dim=-2)
+            value = torch.concat([value[:, :, :block_size, :], self.bank_kv[1]], dim=-2)
+            if attn.norm_q is not None:
+                query = attn.norm_q(query)
+            if attn.norm_k is not None:
+                key = attn.norm_k(key)
+            # attention
+            query = torch.cat([encoder_hidden_states_query_proj, query], dim=2)
+            key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
+            value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)
+            if image_rotary_emb is not None:
+                from diffusers.models.embeddings import apply_rotary_emb
+                query = apply_rotary_emb(query, image_rotary_emb)
+                key = apply_rotary_emb(key, image_rotary_emb)
+            query = query[:, :, :scaled_block_size, :]
+            hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False, attn_mask=None)
+            hidden_states = torch.concat([hidden_states, self.bank_attn], dim=-2)
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+        encoder_hidden_states, hidden_states = (
+            hidden_states[:, : encoder_hidden_states.shape[1]],
+            hidden_states[:, encoder_hidden_states.shape[1] :],
+        )
+        # Linear projection (with LoRA weight applied to each proj layer)
+        hidden_states = attn.to_out[0](hidden_states)
+        for i in range(self.n_loras):
+             hidden_states = hidden_states + self.lora_weights[i] * self.proj_loras[i](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
+        cond_hidden_states = hidden_states[:, block_size:,:]
+        hidden_states = hidden_states[:, :block_size,:]
+        return (hidden_states, encoder_hidden_states, cond_hidden_states) if use_cond else (encoder_hidden_states, hidden_states)

src/lora_helper.py ADDED Viewed

	@@ -0,0 +1,196 @@

+from diffusers.models.attention_processor import FluxAttnProcessor2_0
+from safetensors import safe_open
+import re
+import torch
+from .layers_cache import MultiDoubleStreamBlockLoraProcessor, MultiSingleStreamBlockLoraProcessor
+device = "cuda"
+def load_safetensors(path):
+    tensors = {}
+    with safe_open(path, framework="pt", device="cpu") as f:
+        for key in f.keys():
+            tensors[key] = f.get_tensor(key)
+    return tensors
+def get_lora_rank(checkpoint):
+    for k in checkpoint.keys():
+        if k.endswith(".down.weight"):
+            return checkpoint[k].shape[0]
+def load_checkpoint(local_path):
+    if local_path is not None:
+        if '.safetensors' in local_path:
+            print(f"Loading .safetensors checkpoint from {local_path}")
+            checkpoint = load_safetensors(local_path)
+        else:
+            print(f"Loading checkpoint from {local_path}")
+            checkpoint = torch.load(local_path, map_location='cpu')
+    return checkpoint
+def update_model_with_lora(checkpoint, lora_weights, transformer, cond_size):
+        number = len(lora_weights)
+        ranks = [get_lora_rank(checkpoint) for _ in range(number)]
+        lora_attn_procs = {}
+        double_blocks_idx = list(range(19))
+        single_blocks_idx = list(range(38))
+        for name, attn_processor in transformer.attn_processors.items():
+            match = re.search(r'\.(\d+)\.', name)
+            if match:
+                layer_index = int(match.group(1))
+            if name.startswith("transformer_blocks") and layer_index in double_blocks_idx:
+                lora_state_dicts = {}
+                for key, value in checkpoint.items():
+                    # Match based on the layer index in the key (assuming the key contains layer index)
+                    if re.search(r'\.(\d+)\.', key):
+                        checkpoint_layer_index = int(re.search(r'\.(\d+)\.', key).group(1))
+                        if checkpoint_layer_index == layer_index and key.startswith("transformer_blocks"):
+                            lora_state_dicts[key] = value
+                lora_attn_procs[name] = MultiDoubleStreamBlockLoraProcessor(
+                    dim=3072, ranks=ranks, network_alphas=ranks, lora_weights=lora_weights, device=device, dtype=torch.bfloat16, cond_width=cond_size, cond_height=cond_size, n_loras=number
+                )
+                # Load the weights from the checkpoint dictionary into the corresponding layers
+                for n in range(number):
+                    lora_attn_procs[name].q_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.q_loras.{n}.down.weight', None)
+                    lora_attn_procs[name].q_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.q_loras.{n}.up.weight', None)
+                    lora_attn_procs[name].k_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.k_loras.{n}.down.weight', None)
+                    lora_attn_procs[name].k_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.k_loras.{n}.up.weight', None)
+                    lora_attn_procs[name].v_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.v_loras.{n}.down.weight', None)
+                    lora_attn_procs[name].v_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.v_loras.{n}.up.weight', None)
+                    lora_attn_procs[name].proj_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.proj_loras.{n}.down.weight', None)
+                    lora_attn_procs[name].proj_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.proj_loras.{n}.up.weight', None)
+                    lora_attn_procs[name].to(device)
+            elif name.startswith("single_transformer_blocks") and layer_index in single_blocks_idx:
+                lora_state_dicts = {}
+                for key, value in checkpoint.items():
+                    # Match based on the layer index in the key (assuming the key contains layer index)
+                    if re.search(r'\.(\d+)\.', key):
+                        checkpoint_layer_index = int(re.search(r'\.(\d+)\.', key).group(1))
+                        if checkpoint_layer_index == layer_index and key.startswith("single_transformer_blocks"):
+                            lora_state_dicts[key] = value
+                lora_attn_procs[name] = MultiSingleStreamBlockLoraProcessor(
+                    dim=3072, ranks=ranks, network_alphas=ranks, lora_weights=lora_weights, device=device, dtype=torch.bfloat16, cond_width=cond_size, cond_height=cond_size, n_loras=number
+                )
+                # Load the weights from the checkpoint dictionary into the corresponding layers
+                for n in range(number):
+                    lora_attn_procs[name].q_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.q_loras.{n}.down.weight', None)
+                    lora_attn_procs[name].q_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.q_loras.{n}.up.weight', None)
+                    lora_attn_procs[name].k_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.k_loras.{n}.down.weight', None)
+                    lora_attn_procs[name].k_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.k_loras.{n}.up.weight', None)
+                    lora_attn_procs[name].v_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.v_loras.{n}.down.weight', None)
+                    lora_attn_procs[name].v_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.v_loras.{n}.up.weight', None)
+                    lora_attn_procs[name].to(device)
+            else:
+                lora_attn_procs[name] = FluxAttnProcessor2_0()
+        transformer.set_attn_processor(lora_attn_procs)
+def update_model_with_multi_lora(checkpoints, lora_weights, transformer, cond_size):
+        ck_number = len(checkpoints)
+        cond_lora_number = [len(ls) for ls in lora_weights]
+        cond_number = sum(cond_lora_number)
+        ranks = [get_lora_rank(checkpoint) for checkpoint in checkpoints]
+        multi_lora_weight = []
+        for ls in lora_weights:
+            for n in ls:
+                multi_lora_weight.append(n)
+        lora_attn_procs = {}
+        double_blocks_idx = list(range(19))
+        single_blocks_idx = list(range(38))
+        for name, attn_processor in transformer.attn_processors.items():
+            match = re.search(r'\.(\d+)\.', name)
+            if match:
+                layer_index = int(match.group(1))
+            if name.startswith("transformer_blocks") and layer_index in double_blocks_idx:
+                lora_state_dicts = [{} for _ in range(ck_number)]
+                for idx, checkpoint in enumerate(checkpoints):
+                    for key, value in checkpoint.items():
+                        # Match based on the layer index in the key (assuming the key contains layer index)
+                        if re.search(r'\.(\d+)\.', key):
+                            checkpoint_layer_index = int(re.search(r'\.(\d+)\.', key).group(1))
+                            if checkpoint_layer_index == layer_index and key.startswith("transformer_blocks"):
+                                lora_state_dicts[idx][key] = value
+                lora_attn_procs[name] = MultiDoubleStreamBlockLoraProcessor(
+                    dim=3072, ranks=ranks, network_alphas=ranks, lora_weights=multi_lora_weight, device=device, dtype=torch.bfloat16, cond_width=cond_size, cond_height=cond_size, n_loras=cond_number
+                )
+                # Load the weights from the checkpoint dictionary into the corresponding layers
+                num = 0
+                for idx in range(ck_number):
+                    for n in range(cond_lora_number[idx]):
+                        lora_attn_procs[name].q_loras[num].down.weight.data = lora_state_dicts[idx].get(f'{name}.q_loras.{n}.down.weight', None)
+                        lora_attn_procs[name].q_loras[num].up.weight.data = lora_state_dicts[idx].get(f'{name}.q_loras.{n}.up.weight', None)
+                        lora_attn_procs[name].k_loras[num].down.weight.data = lora_state_dicts[idx].get(f'{name}.k_loras.{n}.down.weight', None)
+                        lora_attn_procs[name].k_loras[num].up.weight.data = lora_state_dicts[idx].get(f'{name}.k_loras.{n}.up.weight', None)
+                        lora_attn_procs[name].v_loras[num].down.weight.data = lora_state_dicts[idx].get(f'{name}.v_loras.{n}.down.weight', None)
+                        lora_attn_procs[name].v_loras[num].up.weight.data = lora_state_dicts[idx].get(f'{name}.v_loras.{n}.up.weight', None)
+                        lora_attn_procs[name].proj_loras[num].down.weight.data = lora_state_dicts[idx].get(f'{name}.proj_loras.{n}.down.weight', None)
+                        lora_attn_procs[name].proj_loras[num].up.weight.data = lora_state_dicts[idx].get(f'{name}.proj_loras.{n}.up.weight', None)
+                        lora_attn_procs[name].to(device)
+                        num += 1
+            elif name.startswith("single_transformer_blocks") and layer_index in single_blocks_idx:
+                lora_state_dicts = [{} for _ in range(ck_number)]
+                for idx, checkpoint in enumerate(checkpoints):
+                    for key, value in checkpoint.items():
+                        # Match based on the layer index in the key (assuming the key contains layer index)
+                        if re.search(r'\.(\d+)\.', key):
+                            checkpoint_layer_index = int(re.search(r'\.(\d+)\.', key).group(1))
+                            if checkpoint_layer_index == layer_index and key.startswith("single_transformer_blocks"):
+                                lora_state_dicts[idx][key] = value
+                lora_attn_procs[name] = MultiSingleStreamBlockLoraProcessor(
+                    dim=3072, ranks=ranks, network_alphas=ranks, lora_weights=multi_lora_weight, device=device, dtype=torch.bfloat16, cond_width=cond_size, cond_height=cond_size, n_loras=cond_number
+                )
+                # Load the weights from the checkpoint dictionary into the corresponding layers
+                num = 0
+                for idx in range(ck_number):
+                    for n in range(cond_lora_number[idx]):
+                        lora_attn_procs[name].q_loras[num].down.weight.data = lora_state_dicts[idx].get(f'{name}.q_loras.{n}.down.weight', None)
+                        lora_attn_procs[name].q_loras[num].up.weight.data = lora_state_dicts[idx].get(f'{name}.q_loras.{n}.up.weight', None)
+                        lora_attn_procs[name].k_loras[num].down.weight.data = lora_state_dicts[idx].get(f'{name}.k_loras.{n}.down.weight', None)
+                        lora_attn_procs[name].k_loras[num].up.weight.data = lora_state_dicts[idx].get(f'{name}.k_loras.{n}.up.weight', None)
+                        lora_attn_procs[name].v_loras[num].down.weight.data = lora_state_dicts[idx].get(f'{name}.v_loras.{n}.down.weight', None)
+                        lora_attn_procs[name].v_loras[num].up.weight.data = lora_state_dicts[idx].get(f'{name}.v_loras.{n}.up.weight', None)
+                        lora_attn_procs[name].to(device)
+                        num += 1
+            else:
+                lora_attn_procs[name] = FluxAttnProcessor2_0()
+        transformer.set_attn_processor(lora_attn_procs)
+def set_single_lora(transformer, local_path, lora_weights=[], cond_size=512):
+    checkpoint = load_checkpoint(local_path)
+    update_model_with_lora(checkpoint, lora_weights, transformer, cond_size)
+def set_multi_lora(transformer, local_paths, lora_weights=[[]], cond_size=512):
+    checkpoints = [load_checkpoint(local_path) for local_path in local_paths]
+    update_model_with_multi_lora(checkpoints, lora_weights, transformer, cond_size)
+def unset_lora(transformer):
+    lora_attn_procs = {}
+    for name, attn_processor in transformer.attn_processors.items():
+        lora_attn_procs[name] = FluxAttnProcessor2_0()
+    transformer.set_attn_processor(lora_attn_procs)
+'''
+unset_lora(pipe.transformer)
+lora_path = "./lora.safetensors"
+lora_weights = [1, 1]
+set_lora(pipe.transformer, local_path=lora_path, lora_weights=lora_weights, cond_size=512)
+'''

src/pipeline.py ADDED Viewed

	@@ -0,0 +1,722 @@

+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+import numpy as np
+import torch
+from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast
+from diffusers.image_processor import (VaeImageProcessor)
+from diffusers.loaders import FluxLoraLoaderMixin, FromSingleFileMixin
+from diffusers.models.autoencoders import AutoencoderKL
+from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
+from diffusers.utils import (
+    USE_PEFT_BACKEND,
+    is_torch_xla_available,
+    logging,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from diffusers.utils.torch_utils import randn_tensor
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.flux.pipeline_output import FluxPipelineOutput
+from torchvision.transforms.functional import pad
+from .transformer_flux import FluxTransformer2DModel
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+def calculate_shift(
+        image_seq_len,
+        base_seq_len: int = 256,
+        max_seq_len: int = 4096,
+        base_shift: float = 0.5,
+        max_shift: float = 1.16,
+):
+    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
+    b = base_shift - m * base_seq_len
+    mu = image_seq_len * m + b
+    return mu
+def prepare_latent_image_ids_(height, width, device, dtype):
+    latent_image_ids = torch.zeros(height//2, width//2, 3, device=device, dtype=dtype)
+    latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height//2, device=device)[:, None]  # y
+    latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width//2, device=device)[None, :]   # x
+    return latent_image_ids
+def prepare_latent_subject_ids(height, width, device, dtype):
+    latent_image_ids = torch.zeros(height // 2, width // 2, 3, device=device, dtype=dtype)
+    latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2, device=device)[:, None]
+    latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2, device=device)[None, :]
+    latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
+    latent_image_ids = latent_image_ids.reshape(
+        latent_image_id_height * latent_image_id_width, latent_image_id_channels
+    )
+    return latent_image_ids.to(device=device, dtype=dtype)
+def resize_position_encoding(batch_size, original_height, original_width, target_height, target_width, device, dtype):
+    latent_image_ids = prepare_latent_image_ids_(original_height, original_width, device, dtype)
+    latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
+    latent_image_ids = latent_image_ids.reshape(
+        latent_image_id_height * latent_image_id_width, latent_image_id_channels
+    )
+    scale_h = original_height / target_height
+    scale_w = original_width / target_width
+    latent_image_ids_resized = torch.zeros(target_height//2, target_width//2, 3, device=device, dtype=dtype)
+    latent_image_ids_resized[..., 1] = latent_image_ids_resized[..., 1] + torch.arange(target_height//2, device=device)[:, None] * scale_h
+    latent_image_ids_resized[..., 2] = latent_image_ids_resized[..., 2] + torch.arange(target_width//2, device=device)[None, :] * scale_w
+    cond_latent_image_id_height, cond_latent_image_id_width, cond_latent_image_id_channels = latent_image_ids_resized.shape
+    cond_latent_image_ids = latent_image_ids_resized.reshape(
+            cond_latent_image_id_height * cond_latent_image_id_width, cond_latent_image_id_channels
+        )
+    return latent_image_ids, cond_latent_image_ids
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+        encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+        scheduler,
+        num_inference_steps: Optional[int] = None,
+        device: Optional[Union[str, torch.device]] = None,
+        timesteps: Optional[List[int]] = None,
+        sigmas: Optional[List[float]] = None,
+        **kwargs,
+):
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+class FluxPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
+    def __init__(
+            self,
+            scheduler: FlowMatchEulerDiscreteScheduler,
+            vae: AutoencoderKL,
+            text_encoder: CLIPTextModel,
+            tokenizer: CLIPTokenizer,
+            text_encoder_2: T5EncoderModel,
+            tokenizer_2: T5TokenizerFast,
+            transformer: FluxTransformer2DModel,
+    ):
+        super().__init__()
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            transformer=transformer,
+            scheduler=scheduler,
+        )
+        self.vae_scale_factor = (
+            2 ** (len(self.vae.config.block_out_channels)) if hasattr(self, "vae") and self.vae is not None else 16
+        )
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.tokenizer_max_length = (
+            self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
+        )
+        self.default_sample_size = 64
+    def _get_t5_prompt_embeds(
+            self,
+            prompt: Union[str, List[str]] = None,
+            num_images_per_prompt: int = 1,
+            max_sequence_length: int = 512,
+            device: Optional[torch.device] = None,
+            dtype: Optional[torch.dtype] = None,
+    ):
+        device = device or self._execution_device
+        dtype = dtype or self.text_encoder.dtype
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+        text_inputs = self.tokenizer_2(
+            prompt,
+            padding="max_length",
+            max_length=max_sequence_length,
+            truncation=True,
+            return_length=False,
+            return_overflowing_tokens=False,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer_2(prompt, padding="longest", return_tensors="pt").input_ids
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer_2.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1: -1])
+            logger.warning(
+                "The following part of your input was truncated because `max_sequence_length` is set to "
+                f" {max_sequence_length} tokens: {removed_text}"
+            )
+        prompt_embeds = self.text_encoder_2(text_input_ids.to(device), output_hidden_states=False)[0]
+        dtype = self.text_encoder_2.dtype
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+        _, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+        return prompt_embeds
+    def _get_clip_prompt_embeds(
+            self,
+            prompt: Union[str, List[str]],
+            num_images_per_prompt: int = 1,
+            device: Optional[torch.device] = None,
+    ):
+        device = device or self._execution_device
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer_max_length,
+            truncation=True,
+            return_overflowing_tokens=False,
+            return_length=False,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1: -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer_max_length} tokens: {removed_text}"
+            )
+        prompt_embeds = self.text_encoder(text_input_ids.to(device), output_hidden_states=False)
+        # Use pooled output of CLIPTextModel
+        prompt_embeds = prompt_embeds.pooler_output
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
+        return prompt_embeds
+    def encode_prompt(
+            self,
+            prompt: Union[str, List[str]],
+            prompt_2: Union[str, List[str]],
+            device: Optional[torch.device] = None,
+            num_images_per_prompt: int = 1,
+            prompt_embeds: Optional[torch.FloatTensor] = None,
+            pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+            max_sequence_length: int = 512,
+            lora_scale: Optional[float] = None,
+    ):
+        device = device or self._execution_device
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, FluxLoraLoaderMixin):
+            self._lora_scale = lora_scale
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder, lora_scale)
+            if self.text_encoder_2 is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder_2, lora_scale)
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+            # We only use the pooled prompt output from the CLIPTextModel
+            pooled_prompt_embeds = self._get_clip_prompt_embeds(
+                prompt=prompt,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+            )
+            prompt_embeds = self._get_t5_prompt_embeds(
+                prompt=prompt_2,
+                num_images_per_prompt=num_images_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+            )
+        if self.text_encoder is not None:
+            if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder_2 is not None:
+            if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+        dtype = self.text_encoder.dtype if self.text_encoder is not None else self.transformer.dtype
+        text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
+        return prompt_embeds, pooled_prompt_embeds, text_ids
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_inpaint.StableDiffusion3InpaintPipeline._encode_vae_image
+    def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
+        if isinstance(generator, list):
+            image_latents = [
+                retrieve_latents(self.vae.encode(image[i: i + 1]), generator=generator[i])
+                for i in range(image.shape[0])
+            ]
+            image_latents = torch.cat(image_latents, dim=0)
+        else:
+            image_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+        image_latents = (image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
+        return image_latents
+    def check_inputs(
+            self,
+            prompt,
+            prompt_2,
+            height,
+            width,
+            prompt_embeds=None,
+            pooled_prompt_embeds=None,
+            callback_on_step_end_tensor_inputs=None,
+            max_sequence_length=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+        if max_sequence_length is not None and max_sequence_length > 512:
+            raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
+    @staticmethod
+    def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
+        latent_image_ids = torch.zeros(height // 2, width // 2, 3)
+        latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2)[:, None]
+        latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2)[None, :]
+        latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
+        latent_image_ids = latent_image_ids.reshape(
+            latent_image_id_height * latent_image_id_width, latent_image_id_channels
+        )
+        return latent_image_ids.to(device=device, dtype=dtype)
+    @staticmethod
+    def _pack_latents(latents, batch_size, num_channels_latents, height, width):
+        latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
+        latents = latents.permute(0, 2, 4, 1, 3, 5)
+        latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)
+        return latents
+    @staticmethod
+    def _unpack_latents(latents, height, width, vae_scale_factor):
+        batch_size, num_patches, channels = latents.shape
+        height = height // vae_scale_factor
+        width = width // vae_scale_factor
+        latents = latents.view(batch_size, height, width, channels // 4, 2, 2)
+        latents = latents.permute(0, 3, 1, 4, 2, 5)
+        latents = latents.reshape(batch_size, channels // (2 * 2), height * 2, width * 2)
+        return latents
+    def enable_vae_slicing(self):
+        r"""
+        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
+        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
+        """
+        self.vae.enable_slicing()
+    def disable_vae_slicing(self):
+        r"""
+        Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_slicing()
+    def enable_vae_tiling(self):
+        r"""
+        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
+        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
+        processing larger images.
+        """
+        self.vae.enable_tiling()
+    def disable_vae_tiling(self):
+        r"""
+        Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_tiling()
+    def prepare_latents(
+            self,
+            batch_size,
+            num_channels_latents,
+            height,
+            width,
+            dtype,
+            device,
+            generator,
+            subject_image,
+            condition_image,
+            latents=None,
+            cond_number=1,
+            sub_number=1
+    ):
+        height_cond = 2 * (self.cond_size // self.vae_scale_factor)
+        width_cond = 2 * (self.cond_size // self.vae_scale_factor)
+        height = 2 * (int(height) // self.vae_scale_factor)
+        width = 2 * (int(width) // self.vae_scale_factor)
+        shape = (batch_size, num_channels_latents, height, width)  # 1 16 106 80
+        noise_latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        noise_latents = self._pack_latents(noise_latents, batch_size, num_channels_latents, height, width)
+        noise_latent_image_ids, cond_latent_image_ids = resize_position_encoding(
+                batch_size,
+                height,
+                width,
+                height_cond,
+                width_cond,
+                device,
+                dtype,
+            )
+        latents_to_concat = []
+        latents_ids_to_concat = [noise_latent_image_ids]
+        # subject
+        if subject_image is not None:
+            shape_subject = (batch_size, num_channels_latents, height_cond*sub_number, width_cond)
+            subject_image = subject_image.to(device=device, dtype=dtype)
+            subject_image_latents = self._encode_vae_image(image=subject_image, generator=generator)
+            subject_latents = self._pack_latents(subject_image_latents, batch_size, num_channels_latents, height_cond*sub_number, width_cond)
+            mask2 = torch.zeros(shape_subject, device=device, dtype=dtype)
+            mask2 = self._pack_latents(mask2, batch_size, num_channels_latents, height_cond*sub_number, width_cond)
+            latent_subject_ids = prepare_latent_subject_ids(height_cond, width_cond, device, dtype)
+            latent_subject_ids[:, 1] += 64  # fixed offset
+            subject_latent_image_ids = torch.concat([latent_subject_ids for _ in range(sub_number)], dim=-2)
+            latents_to_concat.append(subject_latents)
+            latents_ids_to_concat.append(subject_latent_image_ids)
+        # spatial
+        if condition_image is not None:
+            shape_cond = (batch_size, num_channels_latents, height_cond*cond_number, width_cond)
+            condition_image = condition_image.to(device=device, dtype=dtype)
+            image_latents = self._encode_vae_image(image=condition_image, generator=generator)
+            cond_latents = self._pack_latents(image_latents, batch_size, num_channels_latents, height_cond*cond_number, width_cond)
+            mask3 = torch.zeros(shape_cond, device=device, dtype=dtype)
+            mask3 = self._pack_latents(mask3, batch_size, num_channels_latents, height_cond*cond_number, width_cond)
+            cond_latent_image_ids = cond_latent_image_ids
+            cond_latent_image_ids = torch.concat([cond_latent_image_ids for _ in range(cond_number)], dim=-2)
+            latents_ids_to_concat.append(cond_latent_image_ids)
+            latents_to_concat.append(cond_latents)
+        cond_latents = torch.concat(latents_to_concat, dim=-2)
+        latent_image_ids = torch.concat(latents_ids_to_concat, dim=-2)
+        return cond_latents, latent_image_ids, noise_latents
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+    @property
+    def joint_attention_kwargs(self):
+        return self._joint_attention_kwargs
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+    @property
+    def interrupt(self):
+        return self._interrupt
+    @torch.no_grad()
+    def __call__(
+            self,
+            prompt: Union[str, List[str]] = None,
+            prompt_2: Optional[Union[str, List[str]]] = None,
+            height: Optional[int] = None,
+            width: Optional[int] = None,
+            num_inference_steps: int = 28,
+            timesteps: List[int] = None,
+            guidance_scale: float = 3.5,
+            num_images_per_prompt: Optional[int] = 1,
+            generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+            latents: Optional[torch.FloatTensor] = None,
+            prompt_embeds: Optional[torch.FloatTensor] = None,
+            pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+            output_type: Optional[str] = "pil",
+            return_dict: bool = True,
+            joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+            callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+            callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+            max_sequence_length: int = 512,
+            spatial_images=None,
+            subject_images=None,
+            cond_size=512,
+    ):
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+        self.cond_size = cond_size
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            height,
+            width,
+            prompt_embeds=prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            max_sequence_length=max_sequence_length,
+        )
+        self._guidance_scale = guidance_scale
+        self._joint_attention_kwargs = joint_attention_kwargs
+        self._interrupt = False
+        cond_number = len(spatial_images)
+        sub_number = len(subject_images)
+        if sub_number > 0:
+            subject_image_ls = []
+            for subject_image in subject_images:
+                w, h = subject_image.size[:2]
+                scale = self.cond_size / max(h, w)
+                new_h, new_w = int(h * scale), int(w * scale)
+                subject_image = self.image_processor.preprocess(subject_image, height=new_h, width=new_w)
+                subject_image = subject_image.to(dtype=torch.float32)
+                pad_h = cond_size - subject_image.shape[-2]
+                pad_w = cond_size - subject_image.shape[-1]
+                subject_image = pad(
+                    subject_image,
+                    padding=(int(pad_w / 2), int(pad_h / 2), int(pad_w / 2), int(pad_h / 2)),
+                    fill=0
+                )
+                subject_image_ls.append(subject_image)
+            subject_image = torch.concat(subject_image_ls, dim=-2)
+        else:
+            subject_image = None
+        if cond_number > 0:
+            condition_image_ls = []
+            for img in spatial_images:
+                condition_image = self.image_processor.preprocess(img, height=self.cond_size, width=self.cond_size)
+                condition_image = condition_image.to(dtype=torch.float32)
+                condition_image_ls.append(condition_image)
+            condition_image = torch.concat(condition_image_ls, dim=-2)
+        else:
+            condition_image = None
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+        device = self._execution_device
+        lora_scale = (
+            self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
+        )
+        (
+            prompt_embeds,
+            pooled_prompt_embeds,
+            text_ids,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            prompt_embeds=prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            max_sequence_length=max_sequence_length,
+            lora_scale=lora_scale,
+        )
+        # 4. Prepare latent variables
+        num_channels_latents = self.transformer.config.in_channels // 4  # 16
+        cond_latents, latent_image_ids, noise_latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            subject_image,
+            condition_image,
+            latents,
+            cond_number,
+            sub_number
+        )
+        latents = noise_latents
+        # 5. Prepare timesteps
+        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
+        image_seq_len = latents.shape[1]
+        mu = calculate_shift(
+            image_seq_len,
+            self.scheduler.config.base_image_seq_len,
+            self.scheduler.config.max_image_seq_len,
+            self.scheduler.config.base_shift,
+            self.scheduler.config.max_shift,
+        )
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler,
+            num_inference_steps,
+            device,
+            timesteps,
+            sigmas,
+            mu=mu,
+        )
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+        self._num_timesteps = len(timesteps)
+        # handle guidance
+        if self.transformer.config.guidance_embeds:
+            guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
+            guidance = guidance.expand(latents.shape[0])
+        else:
+            guidance = None
+        # 6. Denoising loop
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                timestep = t.expand(latents.shape[0]).to(latents.dtype)
+                noise_pred = self.transformer(
+                    hidden_states=latents,
+                    cond_hidden_states=cond_latents,
+                    timestep=timestep / 1000,
+                    guidance=guidance,
+                    pooled_projections=pooled_prompt_embeds,
+                    encoder_hidden_states=prompt_embeds,
+                    txt_ids=text_ids,
+                    img_ids=latent_image_ids,
+                    joint_attention_kwargs=self.joint_attention_kwargs,
+                    return_dict=False,
+                )[0]
+                # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
+                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+                latents = latents
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+        if output_type == "latent":
+            image = latents
+        else:
+            latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
+            latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
+            image = self.vae.decode(latents.to(dtype=self.vae.dtype), return_dict=False)[0]
+            image = self.image_processor.postprocess(image, output_type=output_type)
+        # Offload all models
+        self.maybe_free_model_hooks()
+        if not return_dict:
+            return (image,)
+        return FluxPipelineOutput(images=image)

src/prompt_helper.py ADDED Viewed

	@@ -0,0 +1,205 @@

+import torch
+def load_text_encoders(args, class_one, class_two):
+    text_encoder_one = class_one.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
+    )
+    text_encoder_two = class_two.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision, variant=args.variant
+    )
+    return text_encoder_one, text_encoder_two
+def tokenize_prompt(tokenizer, prompt, max_sequence_length):
+    text_inputs = tokenizer(
+        prompt,
+        padding="max_length",
+        max_length=max_sequence_length,
+        truncation=True,
+        return_length=False,
+        return_overflowing_tokens=False,
+        return_tensors="pt",
+    )
+    text_input_ids = text_inputs.input_ids
+    return text_input_ids
+def tokenize_prompt_clip(tokenizer, prompt):
+    text_inputs = tokenizer(
+        prompt,
+        padding="max_length",
+        max_length=77,
+        truncation=True,
+        return_length=False,
+        return_overflowing_tokens=False,
+        return_tensors="pt",
+    )
+    text_input_ids = text_inputs.input_ids
+    return text_input_ids
+def tokenize_prompt_t5(tokenizer, prompt):
+    text_inputs = tokenizer(
+        prompt,
+        padding="max_length",
+        max_length=512,
+        truncation=True,
+        return_length=False,
+        return_overflowing_tokens=False,
+        return_tensors="pt",
+    )
+    text_input_ids = text_inputs.input_ids
+    return text_input_ids
+def _encode_prompt_with_t5(
+        text_encoder,
+        tokenizer,
+        max_sequence_length=512,
+        prompt=None,
+        num_images_per_prompt=1,
+        device=None,
+        text_input_ids=None,
+):
+    prompt = [prompt] if isinstance(prompt, str) else prompt
+    batch_size = len(prompt)
+    if tokenizer is not None:
+        text_inputs = tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=max_sequence_length,
+            truncation=True,
+            return_length=False,
+            return_overflowing_tokens=False,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+    else:
+        if text_input_ids is None:
+            raise ValueError("text_input_ids must be provided when the tokenizer is not specified")
+    prompt_embeds = text_encoder(text_input_ids.to(device))[0]
+    dtype = text_encoder.dtype
+    prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+    _, seq_len, _ = prompt_embeds.shape
+    # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+    prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+    return prompt_embeds
+def _encode_prompt_with_clip(
+        text_encoder,
+        tokenizer,
+        prompt: str,
+        device=None,
+        text_input_ids=None,
+        num_images_per_prompt: int = 1,
+):
+    prompt = [prompt] if isinstance(prompt, str) else prompt
+    batch_size = len(prompt)
+    if tokenizer is not None:
+        text_inputs = tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=77,
+            truncation=True,
+            return_overflowing_tokens=False,
+            return_length=False,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+    else:
+        if text_input_ids is None:
+            raise ValueError("text_input_ids must be provided when the tokenizer is not specified")
+    prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=False)
+    # Use pooled output of CLIPTextModel
+    prompt_embeds = prompt_embeds.pooler_output
+    prompt_embeds = prompt_embeds.to(dtype=text_encoder.dtype, device=device)
+    # duplicate text embeddings for each generation per prompt, using mps friendly method
+    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+    prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
+    return prompt_embeds
+def encode_prompt(
+        text_encoders,
+        tokenizers,
+        prompt: str,
+        max_sequence_length,
+        device=None,
+        num_images_per_prompt: int = 1,
+        text_input_ids_list=None,
+):
+    prompt = [prompt] if isinstance(prompt, str) else prompt
+    dtype = text_encoders[0].dtype
+    pooled_prompt_embeds = _encode_prompt_with_clip(
+        text_encoder=text_encoders[0],
+        tokenizer=tokenizers[0],
+        prompt=prompt,
+        device=device if device is not None else text_encoders[0].device,
+        num_images_per_prompt=num_images_per_prompt,
+        text_input_ids=text_input_ids_list[0] if text_input_ids_list else None,
+    )
+    prompt_embeds = _encode_prompt_with_t5(
+        text_encoder=text_encoders[1],
+        tokenizer=tokenizers[1],
+        max_sequence_length=max_sequence_length,
+        prompt=prompt,
+        num_images_per_prompt=num_images_per_prompt,
+        device=device if device is not None else text_encoders[1].device,
+        text_input_ids=text_input_ids_list[1] if text_input_ids_list else None,
+    )
+    text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
+    return prompt_embeds, pooled_prompt_embeds, text_ids
+def encode_token_ids(text_encoders, tokens, accelerator, num_images_per_prompt=1, device=None):
+    text_encoder_clip = text_encoders[0]
+    text_encoder_t5 = text_encoders[1]
+    tokens_clip, tokens_t5 = tokens[0], tokens[1]
+    batch_size = tokens_clip.shape[0]
+    if device == "cpu":
+        device = "cpu"
+    else:
+        device = accelerator.device
+    # clip
+    prompt_embeds = text_encoder_clip(tokens_clip.to(device), output_hidden_states=False)
+    # Use pooled output of CLIPTextModel
+    prompt_embeds = prompt_embeds.pooler_output
+    prompt_embeds = prompt_embeds.to(dtype=text_encoder_clip.dtype, device=accelerator.device)
+    # duplicate text embeddings for each generation per prompt, using mps friendly method
+    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+    pooled_prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
+    pooled_prompt_embeds = pooled_prompt_embeds.to(dtype=text_encoder_clip.dtype, device=accelerator.device)
+    # t5
+    prompt_embeds = text_encoder_t5(tokens_t5.to(device))[0]
+    dtype = text_encoder_t5.dtype
+    prompt_embeds = prompt_embeds.to(dtype=dtype, device=accelerator.device)
+    _, seq_len, _ = prompt_embeds.shape
+    # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+    prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+    text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=accelerator.device, dtype=dtype)
+    return prompt_embeds, pooled_prompt_embeds, text_ids

src/transformer_flux.py ADDED Viewed

	@@ -0,0 +1,583 @@

+from typing import Any, Dict, Optional, Tuple, Union
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.loaders import FluxTransformer2DLoadersMixin, FromOriginalModelMixin, PeftAdapterMixin
+from diffusers.models.attention import FeedForward
+from diffusers.models.attention_processor import (
+    Attention,
+    AttentionProcessor,
+    FluxAttnProcessor2_0,
+    FluxAttnProcessor2_0_NPU,
+    FusedFluxAttnProcessor2_0,
+)
+from diffusers.models.modeling_utils import ModelMixin
+from diffusers.models.normalization import AdaLayerNormContinuous, AdaLayerNormZero, AdaLayerNormZeroSingle
+from diffusers.utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers
+from diffusers.utils.import_utils import is_torch_npu_available
+from diffusers.utils.torch_utils import maybe_allow_in_graph
+from diffusers.models.embeddings import CombinedTimestepGuidanceTextProjEmbeddings, CombinedTimestepTextProjEmbeddings, FluxPosEmbed
+from diffusers.models.modeling_outputs import Transformer2DModelOutput
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+@maybe_allow_in_graph
+class FluxSingleTransformerBlock(nn.Module):
+    def __init__(self, dim, num_attention_heads, attention_head_dim, mlp_ratio=4.0):
+        super().__init__()
+        self.mlp_hidden_dim = int(dim * mlp_ratio)
+        self.norm = AdaLayerNormZeroSingle(dim)
+        self.proj_mlp = nn.Linear(dim, self.mlp_hidden_dim)
+        self.act_mlp = nn.GELU(approximate="tanh")
+        self.proj_out = nn.Linear(dim + self.mlp_hidden_dim, dim)
+        if is_torch_npu_available():
+            processor = FluxAttnProcessor2_0_NPU()
+        else:
+            processor = FluxAttnProcessor2_0()
+        self.attn = Attention(
+            query_dim=dim,
+            cross_attention_dim=None,
+            dim_head=attention_head_dim,
+            heads=num_attention_heads,
+            out_dim=dim,
+            bias=True,
+            processor=processor,
+            qk_norm="rms_norm",
+            eps=1e-6,
+            pre_only=True,
+        )
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cond_hidden_states: torch.Tensor,
+        temb: torch.Tensor,
+        cond_temb: torch.Tensor,
+        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> torch.Tensor:
+        use_cond = cond_hidden_states is not None
+        residual = hidden_states
+        norm_hidden_states, gate = self.norm(hidden_states, emb=temb)
+        mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states))
+        if use_cond:
+            residual_cond = cond_hidden_states
+            norm_cond_hidden_states, cond_gate = self.norm(cond_hidden_states, emb=cond_temb)
+            mlp_cond_hidden_states = self.act_mlp(self.proj_mlp(norm_cond_hidden_states))
+        norm_hidden_states_concat = torch.concat([norm_hidden_states, norm_cond_hidden_states], dim=-2)
+        joint_attention_kwargs = joint_attention_kwargs or {}
+        attn_output = self.attn(
+            hidden_states=norm_hidden_states_concat,
+            image_rotary_emb=image_rotary_emb,
+            use_cond=use_cond,
+            **joint_attention_kwargs,
+        )
+        if use_cond:
+            attn_output, cond_attn_output = attn_output
+        hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2)
+        gate = gate.unsqueeze(1)
+        hidden_states = gate * self.proj_out(hidden_states)
+        hidden_states = residual + hidden_states
+        if use_cond:
+            condition_latents = torch.cat([cond_attn_output, mlp_cond_hidden_states], dim=2)
+            cond_gate = cond_gate.unsqueeze(1)
+            condition_latents = cond_gate * self.proj_out(condition_latents)
+            condition_latents = residual_cond + condition_latents
+        if hidden_states.dtype == torch.float16:
+            hidden_states = hidden_states.clip(-65504, 65504)
+        return hidden_states, condition_latents if use_cond else None
+@maybe_allow_in_graph
+class FluxTransformerBlock(nn.Module):
+    def __init__(
+        self, dim: int, num_attention_heads: int, attention_head_dim: int, qk_norm: str = "rms_norm", eps: float = 1e-6
+    ):
+        super().__init__()
+        self.norm1 = AdaLayerNormZero(dim)
+        self.norm1_context = AdaLayerNormZero(dim)
+        if hasattr(F, "scaled_dot_product_attention"):
+            processor = FluxAttnProcessor2_0()
+        else:
+            raise ValueError(
+                "The current PyTorch version does not support the `scaled_dot_product_attention` function."
+            )
+        self.attn = Attention(
+            query_dim=dim,
+            cross_attention_dim=None,
+            added_kv_proj_dim=dim,
+            dim_head=attention_head_dim,
+            heads=num_attention_heads,
+            out_dim=dim,
+            context_pre_only=False,
+            bias=True,
+            processor=processor,
+            qk_norm=qk_norm,
+            eps=eps,
+        )
+        self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
+        self.ff = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
+        self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
+        self.ff_context = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
+        # let chunk size default to None
+        self._chunk_size = None
+        self._chunk_dim = 0
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cond_hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        temb: torch.Tensor,
+        cond_temb: torch.Tensor,
+        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        use_cond = cond_hidden_states is not None
+        norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)
+        if use_cond:
+                (
+                    norm_cond_hidden_states,
+                    cond_gate_msa,
+                    cond_shift_mlp,
+                    cond_scale_mlp,
+                    cond_gate_mlp,
+                ) = self.norm1(cond_hidden_states, emb=cond_temb)
+        norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
+            encoder_hidden_states, emb=temb
+        )
+        norm_hidden_states = torch.concat([norm_hidden_states, norm_cond_hidden_states], dim=-2)
+        joint_attention_kwargs = joint_attention_kwargs or {}
+        # Attention.
+        attention_outputs = self.attn(
+            hidden_states=norm_hidden_states,
+            encoder_hidden_states=norm_encoder_hidden_states,
+            image_rotary_emb=image_rotary_emb,
+            use_cond=use_cond,
+            **joint_attention_kwargs,
+        )
+        attn_output, context_attn_output = attention_outputs[:2]
+        cond_attn_output = attention_outputs[2] if use_cond else None
+        # Process attention outputs for the `hidden_states`.
+        attn_output = gate_msa.unsqueeze(1) * attn_output
+        hidden_states = hidden_states + attn_output
+        if use_cond:
+            cond_attn_output = cond_gate_msa.unsqueeze(1) * cond_attn_output
+            cond_hidden_states = cond_hidden_states + cond_attn_output
+        norm_hidden_states = self.norm2(hidden_states)
+        norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
+        if use_cond:
+            norm_cond_hidden_states = self.norm2(cond_hidden_states)
+            norm_cond_hidden_states = (
+                norm_cond_hidden_states * (1 + cond_scale_mlp[:, None])
+                + cond_shift_mlp[:, None]
+            )
+        ff_output = self.ff(norm_hidden_states)
+        ff_output = gate_mlp.unsqueeze(1) * ff_output
+        hidden_states = hidden_states + ff_output
+        if use_cond:
+            cond_ff_output = self.ff(norm_cond_hidden_states)
+            cond_ff_output = cond_gate_mlp.unsqueeze(1) * cond_ff_output
+            cond_hidden_states = cond_hidden_states + cond_ff_output
+        # Process attention outputs for the `encoder_hidden_states`.
+        context_attn_output = c_gate_msa.unsqueeze(1) * context_attn_output
+        encoder_hidden_states = encoder_hidden_states + context_attn_output
+        norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
+        norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
+        context_ff_output = self.ff_context(norm_encoder_hidden_states)
+        encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output
+        if encoder_hidden_states.dtype == torch.float16:
+            encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)
+        return encoder_hidden_states, hidden_states, cond_hidden_states if use_cond else None
+class FluxTransformer2DModel(
+    ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, FluxTransformer2DLoadersMixin
+):
+    _supports_gradient_checkpointing = True
+    _no_split_modules = ["FluxTransformerBlock", "FluxSingleTransformerBlock"]
+    @register_to_config
+    def __init__(
+        self,
+        patch_size: int = 1,
+        in_channels: int = 64,
+        out_channels: Optional[int] = None,
+        num_layers: int = 19,
+        num_single_layers: int = 38,
+        attention_head_dim: int = 128,
+        num_attention_heads: int = 24,
+        joint_attention_dim: int = 4096,
+        pooled_projection_dim: int = 768,
+        guidance_embeds: bool = False,
+        axes_dims_rope: Tuple[int] = (16, 56, 56),
+    ):
+        super().__init__()
+        self.out_channels = out_channels or in_channels
+        self.inner_dim = num_attention_heads * attention_head_dim
+        self.pos_embed = FluxPosEmbed(theta=10000, axes_dim=axes_dims_rope)
+        text_time_guidance_cls = (
+            CombinedTimestepGuidanceTextProjEmbeddings if guidance_embeds else CombinedTimestepTextProjEmbeddings
+        )
+        self.time_text_embed = text_time_guidance_cls(
+            embedding_dim=self.inner_dim, pooled_projection_dim=pooled_projection_dim
+        )
+        self.context_embedder = nn.Linear(joint_attention_dim, self.inner_dim)
+        self.x_embedder = nn.Linear(in_channels, self.inner_dim)
+        self.transformer_blocks = nn.ModuleList(
+            [
+                FluxTransformerBlock(
+                    dim=self.inner_dim,
+                    num_attention_heads=num_attention_heads,
+                    attention_head_dim=attention_head_dim,
+                )
+                for _ in range(num_layers)
+            ]
+        )
+        self.single_transformer_blocks = nn.ModuleList(
+            [
+                FluxSingleTransformerBlock(
+                    dim=self.inner_dim,
+                    num_attention_heads=num_attention_heads,
+                    attention_head_dim=attention_head_dim,
+                )
+                for _ in range(num_single_layers)
+            ]
+        )
+        self.norm_out = AdaLayerNormContinuous(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6)
+        self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True)
+        self.gradient_checkpointing = False
+    @property
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor()
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+            return processors
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+        return processors
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+        r"""
+        Sets the attention processor to use to compute attention.
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+        """
+        count = len(self.attn_processors.keys())
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"))
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections with FusedAttnProcessor2_0->FusedFluxAttnProcessor2_0
+    def fuse_qkv_projections(self):
+        """
+        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
+        are fused. For cross-attention modules, key and value projection matrices are fused.
+        <Tip warning={true}>
+        This API is 🧪 experimental.
+        </Tip>
+        """
+        self.original_attn_processors = None
+        for _, attn_processor in self.attn_processors.items():
+            if "Added" in str(attn_processor.__class__.__name__):
+                raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
+        self.original_attn_processors = self.attn_processors
+        for module in self.modules():
+            if isinstance(module, Attention):
+                module.fuse_projections(fuse=True)
+        self.set_attn_processor(FusedFluxAttnProcessor2_0())
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
+    def unfuse_qkv_projections(self):
+        """Disables the fused QKV projection if enabled.
+        <Tip warning={true}>
+        This API is 🧪 experimental.
+        </Tip>
+        """
+        if self.original_attn_processors is not None:
+            self.set_attn_processor(self.original_attn_processors)
+    def _set_gradient_checkpointing(self, module, value=False):
+        if hasattr(module, "gradient_checkpointing"):
+            module.gradient_checkpointing = value
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cond_hidden_states: torch.Tensor = None,
+        encoder_hidden_states: torch.Tensor = None,
+        pooled_projections: torch.Tensor = None,
+        timestep: torch.LongTensor = None,
+        img_ids: torch.Tensor = None,
+        txt_ids: torch.Tensor = None,
+        guidance: torch.Tensor = None,
+        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+        controlnet_block_samples=None,
+        controlnet_single_block_samples=None,
+        return_dict: bool = True,
+        controlnet_blocks_repeat: bool = False,
+    ) -> Union[torch.Tensor, Transformer2DModelOutput]:
+        if cond_hidden_states is not None:
+            use_condition = True
+        else:
+            use_condition = False
+        if joint_attention_kwargs is not None:
+            joint_attention_kwargs = joint_attention_kwargs.copy()
+            lora_scale = joint_attention_kwargs.pop("scale", 1.0)
+        else:
+            lora_scale = 1.0
+        if USE_PEFT_BACKEND:
+            # weight the lora layers by setting `lora_scale` for each PEFT layer
+            scale_lora_layers(self, lora_scale)
+        else:
+            if joint_attention_kwargs is not None and joint_attention_kwargs.get("scale", None) is not None:
+                logger.warning(
+                    "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
+                )
+        hidden_states = self.x_embedder(hidden_states)
+        cond_hidden_states = self.x_embedder(cond_hidden_states)
+        timestep = timestep.to(hidden_states.dtype) * 1000
+        if guidance is not None:
+            guidance = guidance.to(hidden_states.dtype) * 1000
+        else:
+            guidance = None
+        temb = (
+            self.time_text_embed(timestep, pooled_projections)
+            if guidance is None
+            else self.time_text_embed(timestep, guidance, pooled_projections)
+        )
+        cond_temb = (
+            self.time_text_embed(torch.ones_like(timestep) * 0, pooled_projections)
+                if guidance is None
+                else self.time_text_embed(
+                    torch.ones_like(timestep) * 0, guidance, pooled_projections
+                )
+            )
+        encoder_hidden_states = self.context_embedder(encoder_hidden_states)
+        if txt_ids.ndim == 3:
+            logger.warning(
+                "Passing `txt_ids` 3d torch.Tensor is deprecated."
+                "Please remove the batch dimension and pass it as a 2d torch Tensor"
+            )
+            txt_ids = txt_ids[0]
+        if img_ids.ndim == 3:
+            logger.warning(
+                "Passing `img_ids` 3d torch.Tensor is deprecated."
+                "Please remove the batch dimension and pass it as a 2d torch Tensor"
+            )
+            img_ids = img_ids[0]
+        ids = torch.cat((txt_ids, img_ids), dim=0)
+        image_rotary_emb = self.pos_embed(ids)
+        if joint_attention_kwargs is not None and "ip_adapter_image_embeds" in joint_attention_kwargs:
+            ip_adapter_image_embeds = joint_attention_kwargs.pop("ip_adapter_image_embeds")
+            ip_hidden_states = self.encoder_hid_proj(ip_adapter_image_embeds)
+            joint_attention_kwargs.update({"ip_hidden_states": ip_hidden_states})
+        for index_block, block in enumerate(self.transformer_blocks):
+            if torch.is_grad_enabled() and self.gradient_checkpointing:
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+                    return custom_forward
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                encoder_hidden_states, hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(block),
+                    hidden_states,
+                    encoder_hidden_states,
+                    temb,
+                    image_rotary_emb,
+                    cond_temb=cond_temb if use_condition else None,
+                    cond_hidden_states=cond_hidden_states if use_condition else None,
+                    **ckpt_kwargs,
+                )
+            else:
+                encoder_hidden_states, hidden_states, cond_hidden_states = block(
+                    hidden_states=hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cond_hidden_states=cond_hidden_states if use_condition else None,
+                    temb=temb,
+                    cond_temb=cond_temb if use_condition else None,
+                    image_rotary_emb=image_rotary_emb,
+                    joint_attention_kwargs=joint_attention_kwargs,
+                )
+            # controlnet residual
+            if controlnet_block_samples is not None:
+                interval_control = len(self.transformer_blocks) / len(controlnet_block_samples)
+                interval_control = int(np.ceil(interval_control))
+                # For Xlabs ControlNet.
+                if controlnet_blocks_repeat:
+                    hidden_states = (
+                        hidden_states + controlnet_block_samples[index_block % len(controlnet_block_samples)]
+                    )
+                else:
+                    hidden_states = hidden_states + controlnet_block_samples[index_block // interval_control]
+        hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
+        for index_block, block in enumerate(self.single_transformer_blocks):
+            if torch.is_grad_enabled() and self.gradient_checkpointing:
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+                    return custom_forward
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states, cond_hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(block),
+                    hidden_states,
+                    temb,
+                    image_rotary_emb,
+                    cond_temb=cond_temb if use_condition else None,
+                    cond_hidden_states=cond_hidden_states if use_condition else None,
+                    **ckpt_kwargs,
+                )
+            else:
+                hidden_states, cond_hidden_states = block(
+                    hidden_states=hidden_states,
+                    cond_hidden_states=cond_hidden_states if use_condition else None,
+                    temb=temb,
+                    cond_temb=cond_temb if use_condition else None,
+                    image_rotary_emb=image_rotary_emb,
+                    joint_attention_kwargs=joint_attention_kwargs,
+                )
+            # controlnet residual
+            if controlnet_single_block_samples is not None:
+                interval_control = len(self.single_transformer_blocks) / len(controlnet_single_block_samples)
+                interval_control = int(np.ceil(interval_control))
+                hidden_states[:, encoder_hidden_states.shape[1] :, ...] = (
+                    hidden_states[:, encoder_hidden_states.shape[1] :, ...]
+                    + controlnet_single_block_samples[index_block // interval_control]
+                )
+        hidden_states = hidden_states[:, encoder_hidden_states.shape[1] :, ...]
+        hidden_states = self.norm_out(hidden_states, temb)
+        output = self.proj_out(hidden_states)
+        if USE_PEFT_BACKEND:
+            # remove `lora_scale` from each PEFT layer
+            unscale_lora_layers(self, lora_scale)
+        if not return_dict:
+            return (output,)
+        return Transformer2DModelOutput(sample=output)