chaojiemao commited on
Commit
83266af
·
1 Parent(s): 9cee51c

modify old files

Browse files
__init__.py DELETED
@@ -1 +0,0 @@
1
- from . import models
 
 
ace_flux_inference.py DELETED
@@ -1,329 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- # Copyright (c) Alibaba, Inc. and its affiliates.
3
- import math
4
- import os
5
- import random
6
- import numpy as np
7
- import torch
8
- import torch.nn.functional as F
9
- from PIL import Image
10
- import torchvision.transforms as T
11
- from scepter.modules.model.registry import DIFFUSIONS, BACKBONES
12
- import torchvision.transforms.functional as TF
13
- from scepter.modules.model.utils.basic_utils import check_list_of_list
14
- from scepter.modules.model.utils.basic_utils import \
15
- pack_imagelist_into_tensor_v2 as pack_imagelist_into_tensor
16
- from scepter.modules.model.utils.basic_utils import (
17
- to_device, unpack_tensor_into_imagelist)
18
- from scepter.modules.utils.distribute import we
19
- from scepter.modules.utils.file_system import FS
20
- from scepter.modules.utils.logger import get_logger
21
- from scepter.modules.inference.diffusion_inference import DiffusionInference, get_model
22
-
23
- def process_edit_image(images,
24
- masks,
25
- tasks):
26
-
27
- if not isinstance(images, list):
28
- images = [images]
29
- if not isinstance(masks, list):
30
- masks = [masks]
31
- if not isinstance(tasks, list):
32
- tasks = [tasks]
33
-
34
- img_tensors = []
35
- mask_tensors = []
36
- for img, mask, task in zip(images, masks, tasks):
37
- if mask is None or mask == '':
38
- mask = Image.new('L', img.size, 0)
39
- img = TF.center_crop(img, [512, 512])
40
- mask = TF.center_crop(mask, [512, 512])
41
-
42
- mask = np.asarray(mask)
43
- mask = np.where(mask > 128, 1, 0)
44
- mask = mask.astype(
45
- np.float32) if np.any(mask) else np.ones_like(mask).astype(
46
- np.float32)
47
-
48
- img_tensor = TF.to_tensor(img).to(we.device_id)
49
- img_tensor = TF.normalize(img_tensor,
50
- mean=[0.5, 0.5, 0.5],
51
- std=[0.5, 0.5, 0.5])
52
- mask_tensor = TF.to_tensor(mask).to(we.device_id)
53
- if task in ['inpainting', 'Try On', 'Inpainting']:
54
- mask_indicator = mask_tensor.repeat(3, 1, 1)
55
- img_tensor[mask_indicator == 1] = -1.0
56
- img_tensors.append(img_tensor)
57
- mask_tensors.append(mask_tensor)
58
- return img_tensors, mask_tensors
59
-
60
- class FluxACEInference(DiffusionInference):
61
-
62
- def __init__(self, logger=None):
63
- if logger is None:
64
- logger = get_logger(name='scepter')
65
- self.logger = logger
66
- self.loaded_model = {}
67
- self.loaded_model_name = [
68
- 'diffusion_model', 'first_stage_model', 'cond_stage_model', 'ref_cond_stage_model'
69
- ]
70
-
71
- def init_from_cfg(self, cfg):
72
- self.name = cfg.NAME
73
- self.is_default = cfg.get('IS_DEFAULT', False)
74
- self.use_dynamic_model = cfg.get('USE_DYNAMIC_MODEL', True)
75
- module_paras = self.load_default(cfg.get('DEFAULT_PARAS', None))
76
- assert cfg.have('MODEL')
77
- self.size_factor = cfg.get('SIZE_FACTOR', 8)
78
- self.diffusion_model = self.infer_model(
79
- cfg.MODEL.DIFFUSION_MODEL, module_paras.get(
80
- 'DIFFUSION_MODEL',
81
- None)) if cfg.MODEL.have('DIFFUSION_MODEL') else None
82
- self.first_stage_model = self.infer_model(
83
- cfg.MODEL.FIRST_STAGE_MODEL,
84
- module_paras.get(
85
- 'FIRST_STAGE_MODEL',
86
- None)) if cfg.MODEL.have('FIRST_STAGE_MODEL') else None
87
- self.cond_stage_model = self.infer_model(
88
- cfg.MODEL.COND_STAGE_MODEL,
89
- module_paras.get(
90
- 'COND_STAGE_MODEL',
91
- None)) if cfg.MODEL.have('COND_STAGE_MODEL') else None
92
-
93
- self.ref_cond_stage_model = self.infer_model(
94
- cfg.MODEL.REF_COND_STAGE_MODEL,
95
- module_paras.get(
96
- 'REF_COND_STAGE_MODEL',
97
- None)) if cfg.MODEL.have('REF_COND_STAGE_MODEL') else None
98
-
99
- self.diffusion = DIFFUSIONS.build(cfg.MODEL.DIFFUSION,
100
- logger=self.logger)
101
- self.interpolate_func = lambda x: (F.interpolate(
102
- x.unsqueeze(0),
103
- scale_factor=1 / self.size_factor,
104
- mode='nearest-exact') if x is not None else None)
105
-
106
- self.max_seq_length = cfg.get("MAX_SEQ_LENGTH", 4096)
107
- if not self.use_dynamic_model:
108
- self.dynamic_load(self.first_stage_model, 'first_stage_model')
109
- self.dynamic_load(self.cond_stage_model, 'cond_stage_model')
110
- if self.ref_cond_stage_model is not None: self.dynamic_load(self.ref_cond_stage_model, 'ref_cond_stage_model')
111
- with torch.device("meta"):
112
- pretrained_model = self.diffusion_model['cfg'].PRETRAINED_MODEL
113
- self.diffusion_model['cfg'].PRETRAINED_MODEL = None
114
- diffusers_lora = self.diffusion_model['cfg'].get("DIFFUSERS_LORA_MODEL", None)
115
- self.diffusion_model['cfg'].DIFFUSERS_LORA_MODEL = None
116
- swift_lora = self.diffusion_model['cfg'].get("SWIFT_LORA_MODEL", None)
117
- self.diffusion_model['cfg'].SWIFT_LORA_MODEL = None
118
- pretrain_adapter = self.diffusion_model['cfg'].get("PRETRAIN_ADAPTER", None)
119
- self.diffusion_model['cfg'].PRETRAIN_ADAPTER = None
120
- blackforest_lora = self.diffusion_model['cfg'].get("BLACKFOREST_LORA_MODEL", None)
121
- self.diffusion_model['cfg'].BLACKFOREST_LORA_MODEL = None
122
- self.diffusion_model['model'] = BACKBONES.build(self.diffusion_model['cfg'], logger=self.logger).eval()
123
- # self.dynamic_load(self.diffusion_model, 'diffusion_model')
124
- self.diffusion_model['model'].lora_model = diffusers_lora
125
- self.diffusion_model['model'].swift_lora_model = swift_lora
126
- self.diffusion_model['model'].pretrain_adapter = pretrain_adapter
127
- self.diffusion_model['model'].blackforest_lora_model = blackforest_lora
128
- self.diffusion_model['model'].load_pretrained_model(pretrained_model)
129
- self.diffusion_model['device'] = we.device_id
130
-
131
- def upscale_resize(self, image, interpolation=T.InterpolationMode.BILINEAR):
132
- c, H, W = image.shape
133
- scale = max(1.0, math.sqrt(self.max_seq_length / ((H / 16) * (W / 16))))
134
- rH = int(H * scale) // 16 * 16 # ensure divisible by self.d
135
- rW = int(W * scale) // 16 * 16
136
- image = T.Resize((rH, rW), interpolation=interpolation, antialias=True)(image)
137
- return image
138
-
139
-
140
- @torch.no_grad()
141
- def encode_first_stage(self, x, **kwargs):
142
- _, dtype = self.get_function_info(self.first_stage_model, 'encode')
143
- with torch.autocast('cuda',
144
- enabled=dtype in ('float16', 'bfloat16'),
145
- dtype=getattr(torch, dtype)):
146
- def run_one_image(u):
147
- zu = get_model(self.first_stage_model).encode(u)
148
- if isinstance(zu, (tuple, list)):
149
- zu = zu[0]
150
- return zu
151
-
152
- z = [run_one_image(u.unsqueeze(0) if u.dim() == 3 else u) for u in x]
153
- return z
154
-
155
-
156
- @torch.no_grad()
157
- def decode_first_stage(self, z):
158
- _, dtype = self.get_function_info(self.first_stage_model, 'decode')
159
- with torch.autocast('cuda',
160
- enabled=dtype in ('float16', 'bfloat16'),
161
- dtype=getattr(torch, dtype)):
162
- return [get_model(self.first_stage_model).decode(zu) for zu in z]
163
-
164
- def noise_sample(self, num_samples, h, w, seed, device = None, dtype = torch.bfloat16):
165
- noise = torch.randn(
166
- num_samples,
167
- 16,
168
- # allow for packing
169
- 2 * math.ceil(h / 16),
170
- 2 * math.ceil(w / 16),
171
- device="cpu",
172
- dtype=dtype,
173
- generator=torch.Generator().manual_seed(seed),
174
- ).to(device)
175
- return noise
176
-
177
- @torch.no_grad()
178
- def __call__(self,
179
- image=None,
180
- mask=None,
181
- prompt='',
182
- task=None,
183
- negative_prompt='',
184
- output_height=1024,
185
- output_width=1024,
186
- sampler='flow_euler',
187
- sample_steps=20,
188
- guide_scale=3.5,
189
- seed=-1,
190
- history_io=None,
191
- tar_index=0,
192
- # align=0,
193
- **kwargs):
194
- input_image, input_mask = image, mask
195
- seed = seed if seed >= 0 else random.randint(0, 2**32 - 1)
196
- if input_image is not None:
197
- # assert isinstance(input_image, list) and isinstance(input_mask, list)
198
- if task is None:
199
- task = [''] * len(input_image)
200
- if not isinstance(prompt, list):
201
- prompt = [prompt] * len(input_image)
202
- prompt = [
203
- pp.replace('{image}', f'{{image{i}}}') if i > 0 else pp
204
- for i, pp in enumerate(prompt)
205
- ]
206
- edit_image, edit_image_mask = process_edit_image(
207
- input_image, input_mask, task)
208
- image = torch.zeros(
209
- size=[3, int(output_height),
210
- int(output_width)])
211
- image_mask = torch.ones(
212
- size=[1, int(output_height),
213
- int(output_width)])
214
- edit_image, edit_image_mask = [edit_image], [edit_image_mask]
215
- else:
216
- edit_image = edit_image_mask = [[]]
217
- image = torch.zeros(
218
- size=[3, int(output_height),
219
- int(output_width)])
220
- image_mask = torch.ones(
221
- size=[1, int(output_height),
222
- int(output_width)])
223
- if not isinstance(prompt, list):
224
- prompt = [prompt]
225
- align = 0
226
- image, image_mask, prompt = [image], [image_mask], [prompt],
227
- align = [align for p in prompt] if isinstance(align, int) else align
228
-
229
- assert check_list_of_list(prompt) and check_list_of_list(
230
- edit_image) and check_list_of_list(edit_image_mask)
231
- # negative prompt is not used
232
- image = to_device(image)
233
- ctx = {}
234
- # Get Noise Shape
235
- self.dynamic_load(self.first_stage_model, 'first_stage_model')
236
- x = self.encode_first_stage(image)
237
- self.dynamic_unload(self.first_stage_model,
238
- 'first_stage_model',
239
- skip_loaded=not self.use_dynamic_model)
240
-
241
- g = torch.Generator(device=we.device_id).manual_seed(seed)
242
- noise = [
243
- torch.randn((1, 16, i.shape[2], i.shape[3]), device=we.device_id, dtype=torch.bfloat16).normal_(generator=g)
244
- for i in x
245
- ]
246
- # import pdb;pdb.set_trace()
247
- noise, x_shapes = pack_imagelist_into_tensor(noise)
248
- ctx['x_shapes'] = x_shapes
249
- ctx['align'] = align
250
-
251
- image_mask = to_device(image_mask, strict=False)
252
- cond_mask = [self.interpolate_func(i) for i in image_mask
253
- ] if image_mask is not None else [None] * len(image)
254
- ctx['x_mask'] = cond_mask
255
- # Encode Prompt
256
- instruction_prompt = [[pp[-1]] if "{image}" in pp[-1] else ["{image} " + pp[-1]] for pp in prompt]
257
- self.dynamic_load(self.cond_stage_model, 'cond_stage_model')
258
- function_name, dtype = self.get_function_info(self.cond_stage_model)
259
- cont = getattr(get_model(self.cond_stage_model), function_name)(instruction_prompt)
260
- cont["context"] = [ct[-1] for ct in cont["context"]]
261
- cont["y"] = [ct[-1] for ct in cont["y"]]
262
- self.dynamic_unload(self.cond_stage_model,
263
- 'cond_stage_model',
264
- skip_loaded=not self.use_dynamic_model)
265
- ctx.update(cont)
266
-
267
- # Encode Edit Images
268
- self.dynamic_load(self.first_stage_model, 'first_stage_model')
269
- edit_image = [to_device(i, strict=False) for i in edit_image]
270
- edit_image_mask = [to_device(i, strict=False) for i in edit_image_mask]
271
- e_img, e_mask = [], []
272
- for u, m in zip(edit_image, edit_image_mask):
273
- if u is None:
274
- continue
275
- if m is None:
276
- m = [None] * len(u)
277
- e_img.append(self.encode_first_stage(u, **kwargs))
278
- e_mask.append([self.interpolate_func(i) for i in m])
279
- self.dynamic_unload(self.first_stage_model,
280
- 'first_stage_model',
281
- skip_loaded=not self.use_dynamic_model)
282
- ctx['edit'] = e_img
283
- ctx['edit_mask'] = e_mask
284
- # Encode Ref Images
285
- if guide_scale is not None:
286
- guide_scale = torch.full((noise.shape[0],), guide_scale, device=noise.device, dtype=noise.dtype)
287
- else:
288
- guide_scale = None
289
-
290
- # Diffusion Process
291
- self.dynamic_load(self.diffusion_model, 'diffusion_model')
292
- function_name, dtype = self.get_function_info(self.diffusion_model)
293
- with torch.autocast('cuda',
294
- enabled=dtype in ('float16', 'bfloat16'),
295
- dtype=getattr(torch, dtype)):
296
- latent = self.diffusion.sample(
297
- noise=noise,
298
- sampler=sampler,
299
- model=get_model(self.diffusion_model),
300
- model_kwargs={
301
- "cond": ctx, "guidance": guide_scale, "gc_seg": -1
302
- },
303
- steps=sample_steps,
304
- show_progress=True,
305
- guide_scale=guide_scale,
306
- return_intermediate=None,
307
- reverse_scale=-1,
308
- **kwargs).float()
309
- if self.use_dynamic_model: self.dynamic_unload(self.diffusion_model,
310
- 'diffusion_model',
311
- skip_loaded=not self.use_dynamic_model)
312
-
313
- # Decode to Pixel Space
314
- self.dynamic_load(self.first_stage_model, 'first_stage_model')
315
- samples = unpack_tensor_into_imagelist(latent, x_shapes)
316
- x_samples = self.decode_first_stage(samples)
317
- self.dynamic_unload(self.first_stage_model,
318
- 'first_stage_model',
319
- skip_loaded=not self.use_dynamic_model)
320
- x_samples = [x.squeeze(0) for x in x_samples]
321
-
322
- imgs = [
323
- torch.clamp((x_i.float() + 1.0) / 2.0,
324
- min=0.0,
325
- max=1.0).squeeze(0).permute(1, 2, 0).cpu().numpy()
326
- for x_i in x_samples
327
- ]
328
- imgs = [Image.fromarray((img * 255).astype(np.uint8)) for img in imgs]
329
- return imgs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py DELETED
@@ -1,1428 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- # Copyright (c) Alibaba, Inc. and its affiliates.
3
- import base64
4
- import copy
5
- import glob
6
- import io
7
- import os, csv, sys
8
- import random
9
- import re
10
- import shlex
11
- import string
12
- import subprocess
13
- import threading
14
- import spaces
15
-
16
- subprocess.run("rm -rf /data-nvme/zerogpu-offload/*", env={}, shell=True)
17
- #subprocess.run(shlex.split('pip install flash-attn --no-build-isolation'),
18
- # env=os.environ | {'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"})
19
- subprocess.run(shlex.split('pip install scepter'))
20
-
21
- import cv2
22
- import gradio as gr
23
- import numpy as np
24
- import torch
25
- import transformers
26
- from PIL import Image
27
- from transformers import AutoModel, AutoTokenizer
28
- from ace_flux_inference import FluxACEInference
29
- from scepter.modules.utils.config import Config
30
- from scepter.modules.utils.directory import get_md5
31
- from scepter.modules.utils.file_system import FS
32
- from scepter.studio.utils.env import init_env
33
- from importlib.metadata import version
34
-
35
- from example import get_examples
36
- from utils import load_image
37
- from huggingface_hub import login
38
-
39
- login(token=os.environ.get("HF_TOKEN", ""))
40
-
41
- csv.field_size_limit(sys.maxsize)
42
-
43
- refresh_sty = '\U0001f504' # 🔄
44
- clear_sty = '\U0001f5d1' # 🗑️
45
- upload_sty = '\U0001f5bc' # 🖼️
46
- sync_sty = '\U0001f4be' # 💾
47
- chat_sty = '\U0001F4AC' # 💬
48
- video_sty = '\U0001f3a5' # 🎥
49
-
50
- lock = threading.Lock()
51
- inference_dict = {
52
- "ACE_FLUX": FluxACEInference,
53
- }
54
-
55
-
56
- class ChatBotUI(object):
57
- def __init__(self,
58
- cfg_general_file,
59
- is_debug=False,
60
- language='en',
61
- root_work_dir='./'):
62
- try:
63
- from diffusers import CogVideoXImageToVideoPipeline
64
- from diffusers.utils import export_to_video
65
- except Exception as e:
66
- print(f"Import diffusers failed, please install or upgrade diffusers. Error information: {e}")
67
-
68
- cfg = Config(cfg_file=cfg_general_file)
69
- if cfg.have("FILE_SYSTEM"):
70
- for file_sys in cfg.FILE_SYSTEM:
71
- fs_prefix = FS.init_fs_client(file_sys)
72
- else:
73
- fs_prefix = FS.init_fs_client(cfg)
74
- cfg.WORK_DIR = os.path.join(root_work_dir, cfg.WORK_DIR)
75
- if not FS.exists(cfg.WORK_DIR):
76
- FS.make_dir(cfg.WORK_DIR)
77
- cfg = init_env(cfg)
78
- self.cache_dir = cfg.WORK_DIR
79
- self.chatbot_examples = get_examples(self.cache_dir) if not cfg.get('SKIP_EXAMPLES', False) else []
80
- self.model_cfg_dir = cfg.MODEL.EDIT_MODEL.MODEL_CFG_DIR
81
- self.model_yamls = glob.glob(os.path.join(self.model_cfg_dir,
82
- '*.yaml'))
83
- self.model_choices = dict()
84
- self.default_model_name = ''
85
- for i in self.model_yamls:
86
- model_cfg = Config(load=True, cfg_file=i)
87
- model_name = model_cfg.NAME
88
- if model_cfg.IS_DEFAULT: self.default_model_name = model_name
89
- self.model_choices[model_name] = model_cfg
90
- print('Models: ', self.model_choices.keys())
91
- local_folder = FS.get_dir_to_local_dir("hf://black-forest-labs/FLUX.1-dev")
92
- subprocess.run(shlex.split(f'rm -rf {local_folder}/transformer'))
93
- subprocess.run(shlex.split(f'rm -rf {local_folder}/vae'))
94
- subprocess.run(shlex.split(f'rm -rf {local_folder}/flux1-dev.safetensors'))
95
-
96
- assert len(self.model_choices) > 0
97
- if self.default_model_name == "": self.default_model_name = list(self.model_choices.keys())[0]
98
- self.model_name = self.default_model_name
99
- pipe_cfg = self.model_choices[self.default_model_name]
100
- infer_name = pipe_cfg.get("INFERENCE_TYPE", "ACE")
101
- self.pipe = inference_dict[infer_name]()
102
- self.pipe.init_from_cfg(pipe_cfg)
103
- self.max_msgs = 20
104
- self.enable_i2v = cfg.get('ENABLE_I2V', False)
105
- self.gradio_version = version('gradio')
106
-
107
- if self.enable_i2v:
108
- self.i2v_model_dir = cfg.MODEL.I2V.MODEL_DIR
109
- self.i2v_model_name = cfg.MODEL.I2V.MODEL_NAME
110
- if self.i2v_model_name == 'CogVideoX-5b-I2V':
111
- with FS.get_dir_to_local_dir(self.i2v_model_dir) as local_dir:
112
- self.i2v_pipe = CogVideoXImageToVideoPipeline.from_pretrained(
113
- local_dir, torch_dtype=torch.bfloat16).cuda()
114
- else:
115
- raise NotImplementedError
116
-
117
- with FS.get_dir_to_local_dir(
118
- cfg.MODEL.CAPTIONER.MODEL_DIR) as local_dir:
119
- self.captioner = AutoModel.from_pretrained(
120
- local_dir,
121
- torch_dtype=torch.bfloat16,
122
- low_cpu_mem_usage=True,
123
- use_flash_attn=True,
124
- trust_remote_code=True).eval().cuda()
125
- self.llm_tokenizer = AutoTokenizer.from_pretrained(
126
- local_dir, trust_remote_code=True, use_fast=False)
127
- self.llm_generation_config = dict(max_new_tokens=1024,
128
- do_sample=True)
129
- self.llm_prompt = cfg.LLM.PROMPT
130
- self.llm_max_num = 2
131
-
132
- with FS.get_dir_to_local_dir(
133
- cfg.MODEL.ENHANCER.MODEL_DIR) as local_dir:
134
- self.enhancer = transformers.pipeline(
135
- 'text-generation',
136
- model=local_dir,
137
- model_kwargs={'torch_dtype': torch.bfloat16},
138
- device_map='auto',
139
- )
140
-
141
- sys_prompt = """You are part of a team of bots that creates videos. You work with an assistant bot that will draw anything you say in square brackets.
142
- For example , outputting " a beautiful morning in the woods with the sun peaking through the trees " will trigger your partner bot to output an video of a forest morning , as described. You will be prompted by people looking to create detailed , amazing videos. The way to accomplish this is to take their short prompts and make them extremely detailed and descriptive.
143
- There are a few rules to follow:
144
- You will only ever output a single video description per user request.
145
- When modifications are requested , you should not simply make the description longer . You should refactor the entire description to integrate the suggestions.
146
- Other times the user will not want modifications , but instead want a new image . In this case , you should ignore your previous conversation with the user.
147
- Video descriptions must have the same num of words as examples below. Extra words will be ignored.
148
- """
149
- self.enhance_ctx = [
150
- {
151
- 'role': 'system',
152
- 'content': sys_prompt
153
- },
154
- {
155
- 'role':
156
- 'user',
157
- 'content':
158
- 'Create an imaginative video descriptive caption or modify an earlier caption for the user input : "a girl is on the beach"',
159
- },
160
- {
161
- 'role':
162
- 'assistant',
163
- 'content':
164
- "A radiant woman stands on a deserted beach, arms outstretched, wearing a beige trench coat, white blouse, light blue jeans, and chic boots, against a backdrop of soft sky and sea. Moments later, she is seen mid-twirl, arms exuberant, with the lighting suggesting dawn or dusk. Then, she runs along the beach, her attire complemented by an off-white scarf and black ankle boots, the tranquil sea behind her. Finally, she holds a paper airplane, her pose reflecting joy and freedom, with the ocean's gentle waves and the sky's soft pastel hues enhancing the serene ambiance.",
165
- },
166
- {
167
- 'role':
168
- 'user',
169
- 'content':
170
- 'Create an imaginative video descriptive caption or modify an earlier caption for the user input : "A man jogging on a football field"',
171
- },
172
- {
173
- 'role':
174
- 'assistant',
175
- 'content':
176
- "A determined man in athletic attire, including a blue long-sleeve shirt, black shorts, and blue socks, jogs around a snow-covered soccer field, showcasing his solitary exercise in a quiet, overcast setting. His long dreadlocks, focused expression, and the serene winter backdrop highlight his dedication to fitness. As he moves, his attire, consisting of a blue sports sweatshirt, black athletic pants, gloves, and sneakers, grips the snowy ground. He is seen running past a chain-link fence enclosing the playground area, with a basketball hoop and children's slide, suggesting a moment of solitary exercise amidst the empty field.",
177
- },
178
- {
179
- 'role':
180
- 'user',
181
- 'content':
182
- 'Create an imaginative video descriptive caption or modify an earlier caption for the user input : " A woman is dancing, HD footage, close-up"',
183
- },
184
- {
185
- 'role':
186
- 'assistant',
187
- 'content':
188
- 'A young woman with her hair in an updo and wearing a teal hoodie stands against a light backdrop, initially looking over her shoulder with a contemplative expression. She then confidently makes a subtle dance move, suggesting rhythm and movement. Next, she appears poised and focused, looking directly at the camera. Her expression shifts to one of introspection as she gazes downward slightly. Finally, she dances with confidence, her left hand over her heart, symbolizing a poignant moment, all while dressed in the same teal hoodie against a plain, light-colored background.',
189
- },
190
- ]
191
-
192
- def create_ui(self):
193
-
194
- css = '.chatbot.prose.md {opacity: 1.0 !important} #chatbot {opacity: 1.0 !important}'
195
- with gr.Blocks(css=css,
196
- title='Chatbot',
197
- head='Chatbot',
198
- analytics_enabled=False):
199
- self.history = gr.State(value=[])
200
- self.images = gr.State(value={})
201
- self.history_result = gr.State(value={})
202
- self.retry_msg = gr.State(value='')
203
- with gr.Group():
204
- self.ui_mode = gr.State(value='legacy')
205
- with gr.Row(equal_height=True, visible=False) as self.chat_group:
206
- with gr.Column(visible=True) as self.chat_page:
207
- self.chatbot = gr.Chatbot(
208
- height=600,
209
- value=[],
210
- bubble_full_width=False,
211
- show_copy_button=True,
212
- container=False,
213
- placeholder='<strong>Chat Box</strong>')
214
- with gr.Row():
215
- self.clear_btn = gr.Button(clear_sty +
216
- ' Clear Chat',
217
- size='sm')
218
-
219
- with gr.Column(visible=False) as self.editor_page:
220
- with gr.Tabs(visible=False) as self.upload_tabs:
221
- with gr.Tab(id='ImageUploader',
222
- label='Image Uploader',
223
- visible=True) as self.upload_tab:
224
- self.image_uploader = gr.Image(
225
- height=550,
226
- interactive=True,
227
- type='pil',
228
- image_mode='RGB',
229
- sources=['upload'],
230
- elem_id='image_uploader',
231
- format='png')
232
- with gr.Row():
233
- self.sub_btn_1 = gr.Button(
234
- value='Submit',
235
- elem_id='upload_submit')
236
- self.ext_btn_1 = gr.Button(value='Exit')
237
- with gr.Tabs(visible=False) as self.edit_tabs:
238
- with gr.Tab(id='ImageEditor',
239
- label='Image Editor') as self.edit_tab:
240
- self.mask_type = gr.Dropdown(
241
- label='Mask Type',
242
- choices=[
243
- 'Background', 'Composite',
244
- 'Outpainting'
245
- ],
246
- value='Background')
247
- self.mask_type_info = gr.HTML(
248
- value=
249
- "<div style='background-color: white; padding-left: 15px; color: grey;'>Background mode will not erase the visual content in the mask area</div>"
250
- )
251
- with gr.Accordion(
252
- label='Outpainting Setting',
253
- open=True,
254
- visible=False) as self.outpaint_tab:
255
- with gr.Row(variant='panel'):
256
- self.top_ext = gr.Slider(
257
- show_label=True,
258
- label='Top Extend Ratio',
259
- minimum=0.0,
260
- maximum=2.0,
261
- step=0.1,
262
- value=0.25)
263
- self.bottom_ext = gr.Slider(
264
- show_label=True,
265
- label='Bottom Extend Ratio',
266
- minimum=0.0,
267
- maximum=2.0,
268
- step=0.1,
269
- value=0.25)
270
- with gr.Row(variant='panel'):
271
- self.left_ext = gr.Slider(
272
- show_label=True,
273
- label='Left Extend Ratio',
274
- minimum=0.0,
275
- maximum=2.0,
276
- step=0.1,
277
- value=0.25)
278
- self.right_ext = gr.Slider(
279
- show_label=True,
280
- label='Right Extend Ratio',
281
- minimum=0.0,
282
- maximum=2.0,
283
- step=0.1,
284
- value=0.25)
285
- with gr.Row(variant='panel'):
286
- self.img_pad_btn = gr.Button(
287
- value='Pad Image')
288
-
289
- self.image_editor = gr.ImageMask(
290
- value=None,
291
- sources=[],
292
- layers=False,
293
- label='Edit Image',
294
- elem_id='image_editor',
295
- format='png')
296
- with gr.Row():
297
- self.sub_btn_2 = gr.Button(
298
- value='Submit', elem_id='edit_submit')
299
- self.ext_btn_2 = gr.Button(value='Exit')
300
-
301
- with gr.Tab(id='ImageViewer',
302
- label='Image Viewer') as self.image_view_tab:
303
- if self.gradio_version >= '5.0.0':
304
- self.image_viewer = gr.Image(
305
- label='Image',
306
- type='pil',
307
- show_download_button=True,
308
- elem_id='image_viewer')
309
- else:
310
- try:
311
- from gradio_imageslider import ImageSlider
312
- except Exception as e:
313
- print(f"Import gradio_imageslider failed, please install.")
314
- self.image_viewer = ImageSlider(
315
- label='Image',
316
- type='pil',
317
- show_download_button=True,
318
- elem_id='image_viewer')
319
-
320
- self.ext_btn_3 = gr.Button(value='Exit')
321
-
322
- with gr.Tab(id='VideoViewer',
323
- label='Video Viewer',
324
- visible=False) as self.video_view_tab:
325
- self.video_viewer = gr.Video(
326
- label='Video',
327
- interactive=False,
328
- sources=[],
329
- format='mp4',
330
- show_download_button=True,
331
- elem_id='video_viewer',
332
- loop=True,
333
- autoplay=True)
334
-
335
- self.ext_btn_4 = gr.Button(value='Exit')
336
-
337
- with gr.Row(equal_height=True, visible=True) as self.legacy_group:
338
- with gr.Column():
339
- self.legacy_image_uploader = gr.Image(
340
- height=550,
341
- interactive=True,
342
- type='pil',
343
- image_mode='RGB',
344
- elem_id='legacy_image_uploader',
345
- format='png')
346
- with gr.Column():
347
- self.legacy_image_viewer = gr.Image(
348
- label='Image',
349
- height=550,
350
- type='pil',
351
- interactive=False,
352
- show_download_button=True,
353
- elem_id='image_viewer')
354
-
355
- with gr.Accordion(label='Setting', open=False):
356
- with gr.Row():
357
- self.model_name_dd = gr.Dropdown(
358
- choices=self.model_choices,
359
- value=self.default_model_name,
360
- label='Model Version')
361
-
362
- with gr.Row():
363
- self.negative_prompt = gr.Textbox(
364
- value='',
365
- placeholder=
366
- 'Negative prompt used for Classifier-Free Guidance',
367
- label='Negative Prompt',
368
- container=False)
369
-
370
- with gr.Row():
371
- # REFINER_PROMPT
372
- self.refiner_prompt = gr.Textbox(
373
- value=self.pipe.input.get("refiner_prompt", ""),
374
- visible=self.pipe.input.get("refiner_prompt", None) is not None,
375
- placeholder=
376
- 'Prompt used for refiner',
377
- label='Refiner Prompt',
378
- container=False)
379
-
380
- with gr.Row():
381
- with gr.Column(scale=8, min_width=500):
382
- with gr.Row():
383
- self.step = gr.Slider(minimum=1,
384
- maximum=1000,
385
- value=self.pipe.input.get("sample_steps", 20),
386
- visible=self.pipe.input.get("sample_steps", None) is not None,
387
- label='Sample Step')
388
- self.cfg_scale = gr.Slider(
389
- minimum=1.0,
390
- maximum=20.0,
391
- value=self.pipe.input.get("guide_scale", 4.5),
392
- visible=self.pipe.input.get("guide_scale", None) is not None,
393
- label='Guidance Scale')
394
- self.rescale = gr.Slider(minimum=0.0,
395
- maximum=1.0,
396
- value=self.pipe.input.get("guide_rescale", 0.5),
397
- visible=self.pipe.input.get("guide_rescale", None) is not None,
398
- label='Rescale')
399
- self.refiner_scale = gr.Slider(minimum=-0.1,
400
- maximum=1.0,
401
- value=self.pipe.input.get("refiner_scale", -1),
402
- visible=self.pipe.input.get("refiner_scale",
403
- None) is not None,
404
- label='Refiner Scale')
405
- self.seed = gr.Slider(minimum=-1,
406
- maximum=10000000,
407
- value=-1,
408
- label='Seed')
409
- self.output_height = gr.Slider(
410
- minimum=256,
411
- maximum=1440,
412
- value=self.pipe.input.get("output_height", 1024),
413
- visible=self.pipe.input.get("output_height", None) is not None,
414
- label='Output Height')
415
- self.output_width = gr.Slider(
416
- minimum=256,
417
- maximum=1440,
418
- value=self.pipe.input.get("output_width", 1024),
419
- visible=self.pipe.input.get("output_width", None) is not None,
420
- label='Output Width')
421
- with gr.Column(scale=1, min_width=50):
422
- self.use_history = gr.Checkbox(value=False,
423
- label='Use History')
424
- self.use_ace = gr.Checkbox(value=self.pipe.input.get("use_ace", True),
425
- visible=self.pipe.input.get("use_ace", None) is not None,
426
- label='Use ACE')
427
- self.video_auto = gr.Checkbox(
428
- value=False,
429
- label='Auto Gen Video',
430
- visible=self.enable_i2v)
431
-
432
- with gr.Row(variant='panel',
433
- equal_height=True,
434
- visible=self.enable_i2v):
435
- self.video_fps = gr.Slider(minimum=1,
436
- maximum=16,
437
- value=8,
438
- label='Video FPS',
439
- visible=True)
440
- self.video_frames = gr.Slider(minimum=8,
441
- maximum=49,
442
- value=49,
443
- label='Video Frame Num',
444
- visible=True)
445
- self.video_step = gr.Slider(minimum=1,
446
- maximum=1000,
447
- value=50,
448
- label='Video Sample Step',
449
- visible=True)
450
- self.video_cfg_scale = gr.Slider(
451
- minimum=1.0,
452
- maximum=20.0,
453
- value=6.0,
454
- label='Video Guidance Scale',
455
- visible=True)
456
- self.video_seed = gr.Slider(minimum=-1,
457
- maximum=10000000,
458
- value=-1,
459
- label='Video Seed',
460
- visible=True)
461
-
462
- with gr.Row():
463
- self.chatbot_inst = """
464
- **Instruction**:
465
- 1. Click 'Upload' button to upload one or more images as input images.
466
- 2. Enter '@' in the text box will exhibit all images in the gallery.
467
- 3. Select the image you wish to edit from the gallery, and its Image ID will be displayed in the text box.
468
- 4. Compose the editing instruction for the selected image, incorporating image id '@xxxxxx' into your instruction.
469
- For example, you might say, "Change the girl's skirt in @123456 to blue." The '@xxxxx' token will facilitate the identification of the specific image, and will be automatically replaced by a special token '{image}' in the instruction. Furthermore, it is also possible to engage in text-to-image generation without any initial image input.
470
- 5. Once your instructions are prepared, please click the "Chat" button to view the edited result in the chat window.
471
- 6. **Important** To render text on an image, please ensure to include a space between each letter. For instance, "add text 'g i r l' on the mask area of @xxxxx".
472
- 7. To implement local editing based on a specified mask, simply click on the image within the chat window to access the image editor. Here, you can draw a mask and then click the 'Submit' button to upload the edited image along with the mask. For inpainting tasks, select the 'Composite' mask type, while for outpainting tasks, choose the 'Outpainting' mask type. For all other local editing tasks, please select the 'Background' mask type.
473
- 8. If you find our work valuable, we invite you to refer to the [ACE Page](https://ali-vilab.github.io/ace-page/) for comprehensive information.
474
- """
475
-
476
- self.legacy_inst = """
477
- **Instruction**:
478
- 1. You can edit the image by uploading it; if no image is uploaded, an image will be generated from text..
479
- 2. Enter '@' in the text box will exhibit all images in the gallery.
480
- 3. Select the image you wish to edit from the gallery, and its Image ID will be displayed in the text box.
481
- 4. **Important** To render text on an image, please ensure to include a space between each letter. For instance, "add text 'g i r l' on the mask area of @xxxxx".
482
- 5. To perform multi-step editing, partial editing, inpainting, outpainting, and other operations, please click the Chatbot Checkbox to enable the conversational editing mode and follow the relevant instructions..
483
- 6. If you find our work valuable, we invite you to refer to the [ACE Page](https://ali-vilab.github.io/ace-page/) for comprehensive information.
484
- """
485
-
486
- self.instruction = gr.Markdown(value=self.legacy_inst)
487
-
488
- with gr.Row(variant='panel',
489
- equal_height=True,
490
- show_progress=False):
491
- with gr.Column(scale=1, min_width=100, visible=False) as self.upload_panel:
492
- self.upload_btn = gr.Button(value=upload_sty +
493
- ' Upload',
494
- variant='secondary')
495
- with gr.Column(scale=5, min_width=500):
496
- self.text = gr.Textbox(
497
- placeholder='Input "@" find history of image',
498
- label='Instruction',
499
- container=False)
500
- with gr.Column(scale=1, min_width=100):
501
- self.chat_btn = gr.Button(value='Generate',
502
- variant='primary')
503
- with gr.Column(scale=1, min_width=100):
504
- self.retry_btn = gr.Button(value=refresh_sty +
505
- ' Retry',
506
- variant='secondary')
507
- with gr.Column(scale=1, min_width=100):
508
- self.mode_checkbox = gr.Checkbox(
509
- value=False,
510
- label='ChatBot')
511
- with gr.Column(scale=(1 if self.enable_i2v else 0),
512
- min_width=0):
513
- self.video_gen_btn = gr.Button(value=video_sty +
514
- ' Gen Video',
515
- variant='secondary',
516
- visible=self.enable_i2v)
517
- with gr.Column(scale=(1 if self.enable_i2v else 0),
518
- min_width=0):
519
- self.extend_prompt = gr.Checkbox(
520
- value=True,
521
- label='Extend Prompt',
522
- visible=self.enable_i2v)
523
-
524
- with gr.Row():
525
- self.gallery = gr.Gallery(visible=False,
526
- label='History',
527
- columns=10,
528
- allow_preview=False,
529
- interactive=False)
530
-
531
- self.eg = gr.Column(visible=True)
532
-
533
- def set_callbacks(self, *args, **kwargs):
534
-
535
- ########################################
536
- # @spaces.GPU(duration=60)
537
- def change_model(model_name):
538
- if model_name not in self.model_choices:
539
- gr.Info('The provided model name is not a valid choice!')
540
- return model_name, gr.update(), gr.update()
541
-
542
- if model_name != self.model_name:
543
- lock.acquire()
544
- del self.pipe
545
- torch.cuda.empty_cache()
546
- torch.cuda.ipc_collect()
547
- pipe_cfg = self.model_choices[model_name]
548
- infer_name = pipe_cfg.get("INFERENCE_TYPE", "ACE")
549
- self.pipe = inference_dict[infer_name]()
550
- self.pipe.init_from_cfg(pipe_cfg)
551
- self.model_name = model_name
552
- lock.release()
553
-
554
- return (model_name, gr.update(), gr.update(),
555
- gr.Slider(
556
- value=self.pipe.input.get("sample_steps", 20),
557
- visible=self.pipe.input.get("sample_steps", None) is not None),
558
- gr.Slider(
559
- value=self.pipe.input.get("guide_scale", 4.5),
560
- visible=self.pipe.input.get("guide_scale", None) is not None),
561
- gr.Slider(
562
- value=self.pipe.input.get("guide_rescale", 0.5),
563
- visible=self.pipe.input.get("guide_rescale", None) is not None),
564
- gr.Slider(
565
- value=self.pipe.input.get("output_height", 1024),
566
- visible=self.pipe.input.get("output_height", None) is not None),
567
- gr.Slider(
568
- value=self.pipe.input.get("output_width", 1024),
569
- visible=self.pipe.input.get("output_width", None) is not None),
570
- gr.Textbox(
571
- value=self.pipe.input.get("refiner_prompt", ""),
572
- visible=self.pipe.input.get("refiner_prompt", None) is not None),
573
- gr.Slider(
574
- value=self.pipe.input.get("refiner_scale", -1),
575
- visible=self.pipe.input.get("refiner_scale", None) is not None
576
- ),
577
- gr.Checkbox(
578
- value=self.pipe.input.get("use_ace", True),
579
- visible=self.pipe.input.get("use_ace", None) is not None
580
- )
581
- )
582
-
583
- self.model_name_dd.change(
584
- change_model,
585
- inputs=[self.model_name_dd],
586
- outputs=[
587
- self.model_name_dd, self.chatbot, self.text,
588
- self.step,
589
- self.cfg_scale, self.rescale, self.output_height,
590
- self.output_width, self.refiner_prompt, self.refiner_scale,
591
- self.use_ace])
592
-
593
- def mode_change(mode_check):
594
- if mode_check:
595
- # ChatBot
596
- return (
597
- gr.Row(visible=False),
598
- gr.Row(visible=True),
599
- gr.Button(value='Generate'),
600
- gr.State(value='chatbot'),
601
- gr.Column(visible=True),
602
- gr.Markdown(value=self.chatbot_inst)
603
- )
604
- else:
605
- # Legacy
606
- return (
607
- gr.Row(visible=True),
608
- gr.Row(visible=False),
609
- gr.Button(value=chat_sty + ' Chat'),
610
- gr.State(value='legacy'),
611
- gr.Column(visible=False),
612
- gr.Markdown(value=self.legacy_inst)
613
- )
614
-
615
- self.mode_checkbox.change(mode_change, inputs=[self.mode_checkbox],
616
- outputs=[self.legacy_group, self.chat_group,
617
- self.chat_btn, self.ui_mode,
618
- self.upload_panel, self.instruction])
619
-
620
- ########################################
621
- def generate_gallery(text, images):
622
- if text.endswith(' '):
623
- return gr.update(), gr.update(visible=False)
624
- elif text.endswith('@'):
625
- gallery_info = []
626
- for image_id, image_meta in images.items():
627
- thumbnail_path = image_meta['thumbnail']
628
- gallery_info.append((thumbnail_path, image_id))
629
- return gr.update(), gr.update(visible=True, value=gallery_info)
630
- else:
631
- gallery_info = []
632
- match = re.search('@([^@ ]+)$', text)
633
- if match:
634
- prefix = match.group(1)
635
- for image_id, image_meta in images.items():
636
- if not image_id.startswith(prefix):
637
- continue
638
- thumbnail_path = image_meta['thumbnail']
639
- gallery_info.append((thumbnail_path, image_id))
640
-
641
- if len(gallery_info) > 0:
642
- return gr.update(), gr.update(visible=True,
643
- value=gallery_info)
644
- else:
645
- return gr.update(), gr.update(visible=False)
646
- else:
647
- return gr.update(), gr.update(visible=False)
648
-
649
- self.text.input(generate_gallery,
650
- inputs=[self.text, self.images],
651
- outputs=[self.text, self.gallery],
652
- show_progress='hidden')
653
-
654
- ########################################
655
- def select_image(text, evt: gr.SelectData):
656
- image_id = evt.value['caption']
657
- text = '@'.join(text.split('@')[:-1]) + f'@{image_id} '
658
- return gr.update(value=text), gr.update(visible=False, value=None)
659
-
660
- self.gallery.select(select_image,
661
- inputs=self.text,
662
- outputs=[self.text, self.gallery])
663
-
664
- ########################################
665
- def generate_video(message,
666
- extend_prompt,
667
- history,
668
- images,
669
- num_steps,
670
- num_frames,
671
- cfg_scale,
672
- fps,
673
- seed,
674
- progress=gr.Progress(track_tqdm=True)):
675
-
676
- from diffusers.utils import export_to_video
677
-
678
- generator = torch.Generator(device='cuda').manual_seed(seed)
679
- img_ids = re.findall('@(.*?)[ ,;.?$]', message)
680
- if len(img_ids) == 0:
681
- history.append((
682
- message,
683
- 'Sorry, no images were found in the prompt to be used as the first frame of the video.'
684
- ))
685
- while len(history) >= self.max_msgs:
686
- history.pop(0)
687
- return history, self.get_history(
688
- history), gr.update(), gr.update(visible=False)
689
-
690
- img_id = img_ids[0]
691
- prompt = re.sub(f'@{img_id}\s+', '', message)
692
-
693
- if extend_prompt:
694
- messages = copy.deepcopy(self.enhance_ctx)
695
- messages.append({
696
- 'role':
697
- 'user',
698
- 'content':
699
- f'Create an imaginative video descriptive caption or modify an earlier caption in ENGLISH for the user input: "{prompt}"',
700
- })
701
- lock.acquire()
702
- outputs = self.enhancer(
703
- messages,
704
- max_new_tokens=200,
705
- )
706
-
707
- prompt = outputs[0]['generated_text'][-1]['content']
708
- print(prompt)
709
- lock.release()
710
-
711
- img_meta = images[img_id]
712
- img_path = img_meta['image']
713
- image = Image.open(img_path).convert('RGB')
714
-
715
- lock.acquire()
716
- video = self.i2v_pipe(
717
- prompt=prompt,
718
- image=image,
719
- num_videos_per_prompt=1,
720
- num_inference_steps=num_steps,
721
- num_frames=num_frames,
722
- guidance_scale=cfg_scale,
723
- generator=generator,
724
- ).frames[0]
725
- lock.release()
726
-
727
- out_video_path = export_to_video(video, fps=fps)
728
- history.append((
729
- f"Based on first frame @{img_id} and description '{prompt}', generate a video",
730
- 'This is generated video:'))
731
- history.append((None, out_video_path))
732
- while len(history) >= self.max_msgs:
733
- history.pop(0)
734
-
735
- return history, self.get_history(history), gr.update(
736
- value=''), gr.update(visible=False)
737
-
738
- self.video_gen_btn.click(
739
- generate_video,
740
- inputs=[
741
- self.text, self.extend_prompt, self.history, self.images,
742
- self.video_step, self.video_frames, self.video_cfg_scale,
743
- self.video_fps, self.video_seed
744
- ],
745
- outputs=[self.history, self.chatbot, self.text, self.gallery])
746
-
747
- ########################################
748
- @spaces.GPU(duration=120)
749
- def run_chat(
750
- message,
751
- legacy_image,
752
- ui_mode,
753
- use_ace,
754
- extend_prompt,
755
- history,
756
- images,
757
- use_history,
758
- history_result,
759
- negative_prompt,
760
- cfg_scale,
761
- rescale,
762
- refiner_prompt,
763
- refiner_scale,
764
- step,
765
- seed,
766
- output_h,
767
- output_w,
768
- video_auto,
769
- video_steps,
770
- video_frames,
771
- video_cfg_scale,
772
- video_fps,
773
- video_seed,
774
- progress=gr.Progress(track_tqdm=True)):
775
- legacy_img_ids = []
776
- if ui_mode == 'legacy':
777
- if legacy_image is not None:
778
- history, images, img_id = self.add_uploaded_image_to_history(
779
- legacy_image, history, images)
780
- legacy_img_ids.append(img_id)
781
- retry_msg = message
782
- gen_id = get_md5(message)[:12]
783
- save_path = os.path.join(self.cache_dir, f'{gen_id}.png')
784
-
785
- img_ids = re.findall('@(.*?)[ ,;.?$]', message)
786
- history_io = None
787
-
788
- if len(img_ids) < 1:
789
- img_ids = legacy_img_ids
790
- for img_id in img_ids:
791
- if f'@{img_id}' not in message:
792
- message = f'@{img_id} ' + message
793
-
794
- new_message = message
795
-
796
- if len(img_ids) > 0:
797
- edit_image, edit_image_mask, edit_task = [], [], []
798
- for i, img_id in enumerate(img_ids):
799
- if img_id not in images:
800
- gr.Info(
801
- f'The input image ID {img_id} is not exist... Skip loading image.'
802
- )
803
- continue
804
- placeholder = '{image}' if i == 0 else '{' + f'image{i}' + '}'
805
- if placeholder not in new_message:
806
- new_message = re.sub(f'@{img_id}', placeholder,
807
- new_message)
808
- else:
809
- new_message = re.sub(f'@{img_id} ', "",
810
- new_message, 1)
811
- img_meta = images[img_id]
812
- img_path = img_meta['image']
813
- img_mask = img_meta['mask']
814
- img_mask_type = img_meta['mask_type']
815
- if img_mask_type is not None and img_mask_type == 'Composite':
816
- task = 'inpainting'
817
- else:
818
- task = ''
819
- edit_image.append(Image.open(img_path).convert('RGB'))
820
- edit_image_mask.append(
821
- Image.open(img_mask).
822
- convert('L') if img_mask is not None else None)
823
- edit_task.append(task)
824
-
825
- if use_history and (img_id in history_result):
826
- history_io = history_result[img_id]
827
-
828
- buffered = io.BytesIO()
829
- edit_image[0].save(buffered, format='PNG')
830
- img_b64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
831
- img_str = f'<img src="data:image/png;base64,{img_b64}" style="pointer-events: none;">'
832
- pre_info = f'Received one or more images, so image editing is conducted.\n The first input image @{img_ids[0]} is:\n {img_str}'
833
- else:
834
- pre_info = 'No image ids were found in the provided text prompt, so text-guided image generation is conducted. \n'
835
- edit_image = None
836
- edit_image_mask = None
837
- edit_task = ''
838
- if new_message == "":
839
- new_message = "a beautiful girl wear a skirt."
840
- print(new_message)
841
- imgs = self.pipe(
842
- image=edit_image,
843
- mask=edit_image_mask,
844
- task=edit_task,
845
- prompt=[new_message] *
846
- len(edit_image) if edit_image is not None else [new_message],
847
- negative_prompt=[negative_prompt] * len(edit_image)
848
- if edit_image is not None else [negative_prompt],
849
- history_io=history_io,
850
- output_height=output_h,
851
- output_width=output_w,
852
- sampler=self.pipe.input.get("sampler", "ddim"),
853
- sample_steps=step,
854
- guide_scale=cfg_scale,
855
- guide_rescale=rescale,
856
- seed=seed,
857
- refiner_prompt=refiner_prompt,
858
- refiner_scale=refiner_scale,
859
- use_ace=use_ace
860
- )
861
-
862
- img = imgs[0]
863
- img.save(save_path, format='JPEG')
864
-
865
- if history_io:
866
- history_io_new = copy.deepcopy(history_io)
867
- history_io_new['image'] += edit_image[:1]
868
- history_io_new['mask'] += edit_image_mask[:1]
869
- history_io_new['task'] += edit_task[:1]
870
- history_io_new['prompt'] += [new_message]
871
- history_io_new['image'] = history_io_new['image'][-5:]
872
- history_io_new['mask'] = history_io_new['mask'][-5:]
873
- history_io_new['task'] = history_io_new['task'][-5:]
874
- history_io_new['prompt'] = history_io_new['prompt'][-5:]
875
- history_result[gen_id] = history_io_new
876
- elif edit_image is not None and len(edit_image) > 0:
877
- history_io_new = {
878
- 'image': edit_image[:1],
879
- 'mask': edit_image_mask[:1],
880
- 'task': edit_task[:1],
881
- 'prompt': [new_message]
882
- }
883
- history_result[gen_id] = history_io_new
884
-
885
- w, h = img.size
886
- if w > h:
887
- tb_w = 128
888
- tb_h = int(h * tb_w / w)
889
- else:
890
- tb_h = 128
891
- tb_w = int(w * tb_h / h)
892
-
893
- thumbnail_path = os.path.join(self.cache_dir,
894
- f'{gen_id}_thumbnail.jpg')
895
- thumbnail = img.resize((tb_w, tb_h))
896
- thumbnail.save(thumbnail_path, format='JPEG')
897
-
898
- images[gen_id] = {
899
- 'image': save_path,
900
- 'mask': None,
901
- 'mask_type': None,
902
- 'thumbnail': thumbnail_path
903
- }
904
-
905
- buffered = io.BytesIO()
906
- img.convert('RGB').save(buffered, format='JPEG')
907
- img_b64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
908
- img_str = f'<img src="data:image/jpg;base64,{img_b64}" style="pointer-events: none;">'
909
-
910
- history.append(
911
- (message,
912
- f'{pre_info} The generated image @{gen_id} is:\n {img_str}'))
913
-
914
- if video_auto:
915
- if video_seed is None or video_seed == -1:
916
- video_seed = random.randint(0, 10000000)
917
-
918
- lock.acquire()
919
- generator = torch.Generator(
920
- device='cuda').manual_seed(video_seed)
921
- pixel_values = load_image(img.convert('RGB'),
922
- max_num=self.llm_max_num).to(
923
- torch.bfloat16).cuda()
924
- prompt = self.captioner.chat(self.llm_tokenizer, pixel_values,
925
- self.llm_prompt,
926
- self.llm_generation_config)
927
- print(prompt)
928
- lock.release()
929
-
930
- if extend_prompt:
931
- messages = copy.deepcopy(self.enhance_ctx)
932
- messages.append({
933
- 'role':
934
- 'user',
935
- 'content':
936
- f'Create an imaginative video descriptive caption or modify an earlier caption in ENGLISH for the user input: "{prompt}"',
937
- })
938
- lock.acquire()
939
- outputs = self.enhancer(
940
- messages,
941
- max_new_tokens=200,
942
- )
943
- prompt = outputs[0]['generated_text'][-1]['content']
944
- print(prompt)
945
- lock.release()
946
-
947
- lock.acquire()
948
- video = self.i2v_pipe(
949
- prompt=prompt,
950
- image=img,
951
- num_videos_per_prompt=1,
952
- num_inference_steps=video_steps,
953
- num_frames=video_frames,
954
- guidance_scale=video_cfg_scale,
955
- generator=generator,
956
- ).frames[0]
957
- lock.release()
958
-
959
- out_video_path = export_to_video(video, fps=video_fps)
960
- history.append((
961
- f"Based on first frame @{gen_id} and description '{prompt}', generate a video",
962
- 'This is generated video:'))
963
- history.append((None, out_video_path))
964
-
965
- while len(history) >= self.max_msgs:
966
- history.pop(0)
967
-
968
- return (history, images, gr.Image(value=save_path),
969
- history_result, self.get_history(
970
- history), gr.update(), gr.update(
971
- visible=False), retry_msg)
972
-
973
- chat_inputs = [
974
- self.legacy_image_uploader, self.ui_mode, self.use_ace,
975
- self.extend_prompt, self.history, self.images, self.use_history,
976
- self.history_result, self.negative_prompt, self.cfg_scale,
977
- self.rescale, self.refiner_prompt, self.refiner_scale,
978
- self.step, self.seed, self.output_height,
979
- self.output_width, self.video_auto, self.video_step,
980
- self.video_frames, self.video_cfg_scale, self.video_fps,
981
- self.video_seed
982
- ]
983
-
984
- chat_outputs = [
985
- self.history, self.images, self.legacy_image_viewer,
986
- self.history_result, self.chatbot,
987
- self.text, self.gallery, self.retry_msg
988
- ]
989
-
990
- self.chat_btn.click(run_chat,
991
- inputs=[self.text] + chat_inputs,
992
- outputs=chat_outputs)
993
-
994
- self.text.submit(run_chat,
995
- inputs=[self.text] + chat_inputs,
996
- outputs=chat_outputs)
997
-
998
- def retry_fn(*args):
999
- return run_chat(*args)
1000
-
1001
- self.retry_btn.click(retry_fn,
1002
- inputs=[self.retry_msg] + chat_inputs,
1003
- outputs=chat_outputs)
1004
-
1005
- ########################################
1006
- @spaces.GPU(duration=120)
1007
- def run_example(task, img, img_mask, ref1, prompt, seed):
1008
- edit_image, edit_image_mask, edit_task = [], [], []
1009
- if img is not None:
1010
- w, h = img.size
1011
- if w > 2048:
1012
- ratio = w / 2048.
1013
- w = 2048
1014
- h = int(h / ratio)
1015
- if h > 2048:
1016
- ratio = h / 2048.
1017
- h = 2048
1018
- w = int(w / ratio)
1019
- img = img.resize((w, h))
1020
- edit_image.append(img)
1021
- if img_mask is not None:
1022
- img_mask = img_mask if np.sum(np.array(img_mask)) > 0 else None
1023
- edit_image_mask.append(
1024
- img_mask if img_mask is not None else None)
1025
- edit_task.append(task)
1026
- if ref1 is not None:
1027
- ref1 = ref1 if np.sum(np.array(ref1)) > 0 else None
1028
- if ref1 is not None:
1029
- edit_image.append(ref1)
1030
- edit_image_mask.append(None)
1031
- edit_task.append('')
1032
-
1033
- buffered = io.BytesIO()
1034
- img.save(buffered, format='PNG')
1035
- img_b64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
1036
- img_str = f'<img src="data:image/png;base64,{img_b64}" style="pointer-events: none;">'
1037
- pre_info = f'Received one or more images, so image editing is conducted.\n The first input image is:\n {img_str}'
1038
- else:
1039
- pre_info = 'No image ids were found in the provided text prompt, so text-guided image generation is conducted. \n'
1040
- edit_image = None
1041
- edit_image_mask = None
1042
- edit_task = ''
1043
-
1044
- img_num = len(edit_image) if edit_image is not None else 1
1045
- imgs = self.pipe(
1046
- image=edit_image,
1047
- mask=edit_image_mask,
1048
- task=edit_task,
1049
- prompt=[prompt] * img_num,
1050
- negative_prompt=[''] * img_num,
1051
- seed=seed,
1052
- refiner_prompt=self.pipe.input.get("refiner_prompt", ""),
1053
- refiner_scale=self.pipe.input.get("refiner_scale", 0.0),
1054
- )
1055
-
1056
- img = imgs[0]
1057
- buffered = io.BytesIO()
1058
- img.convert('RGB').save(buffered, format='JPEG')
1059
- img_b64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
1060
- img_str = f'<img src="data:image/png;base64,{img_b64}" style="pointer-events: none;">'
1061
- history = [(prompt,
1062
- f'{pre_info} The generated image is:\n {img_str}')]
1063
-
1064
- img_id = get_md5(img_b64)[:12]
1065
- save_path = os.path.join(self.cache_dir, f'{img_id}.jpg')
1066
- img.convert('RGB').save(save_path)
1067
-
1068
- return self.get_history(history), gr.update(value=prompt), gr.update(
1069
- visible=False), gr.update(value=save_path), gr.update(value=-1)
1070
-
1071
- with self.eg:
1072
- self.example_task = gr.Text(label='Task Name',
1073
- value='',
1074
- visible=False)
1075
- self.example_image = gr.Image(label='Edit Image',
1076
- type='pil',
1077
- image_mode='RGB',
1078
- visible=False)
1079
- self.example_mask = gr.Image(label='Edit Image Mask',
1080
- type='pil',
1081
- image_mode='L',
1082
- visible=False)
1083
- self.example_ref_im1 = gr.Image(label='Ref Image',
1084
- type='pil',
1085
- image_mode='RGB',
1086
- visible=False)
1087
-
1088
- self.examples = gr.Examples(
1089
- fn=run_example,
1090
- examples=self.chatbot_examples,
1091
- inputs=[
1092
- self.example_task, self.example_image, self.example_mask,
1093
- self.example_ref_im1, self.text, self.seed
1094
- ],
1095
- outputs=[self.chatbot, self.text, self.gallery, self.legacy_image_viewer, self.seed],
1096
- examples_per_page=4,
1097
- cache_examples=False,
1098
- run_on_click=True)
1099
-
1100
- ########################################
1101
- def upload_image():
1102
- return (gr.update(visible=True,
1103
- scale=1), gr.update(visible=True, scale=1),
1104
- gr.update(visible=True), gr.update(visible=False),
1105
- gr.update(visible=False), gr.update(visible=False),
1106
- gr.update(visible=True))
1107
-
1108
- self.upload_btn.click(upload_image,
1109
- inputs=[],
1110
- outputs=[
1111
- self.chat_page, self.editor_page,
1112
- self.upload_tab, self.edit_tab,
1113
- self.image_view_tab, self.video_view_tab,
1114
- self.upload_tabs
1115
- ])
1116
-
1117
- ########################################
1118
- def edit_image(evt: gr.SelectData):
1119
- if isinstance(evt.value, str):
1120
- img_b64s = re.findall(
1121
- '<img src="data:image/png;base64,(.*?)" style="pointer-events: none;">',
1122
- evt.value)
1123
- imgs = [
1124
- Image.open(io.BytesIO(base64.b64decode(copy.deepcopy(i))))
1125
- for i in img_b64s
1126
- ]
1127
- if len(imgs) > 0:
1128
- if len(imgs) == 2:
1129
- if self.gradio_version >= '5.0.0':
1130
- view_img = copy.deepcopy(imgs[-1])
1131
- else:
1132
- view_img = copy.deepcopy(imgs)
1133
- edit_img = copy.deepcopy(imgs[-1])
1134
- else:
1135
- if self.gradio_version >= '5.0.0':
1136
- view_img = copy.deepcopy(imgs[-1])
1137
- else:
1138
- view_img = [
1139
- copy.deepcopy(imgs[-1]),
1140
- copy.deepcopy(imgs[-1])
1141
- ]
1142
- edit_img = copy.deepcopy(imgs[-1])
1143
-
1144
- return (gr.update(visible=True,
1145
- scale=1), gr.update(visible=True,
1146
- scale=1),
1147
- gr.update(visible=False), gr.update(visible=True),
1148
- gr.update(visible=True), gr.update(visible=False),
1149
- gr.update(value=edit_img),
1150
- gr.update(value=view_img), gr.update(value=None),
1151
- gr.update(visible=True))
1152
- else:
1153
- return (gr.update(), gr.update(), gr.update(), gr.update(),
1154
- gr.update(), gr.update(), gr.update(), gr.update(),
1155
- gr.update(), gr.update())
1156
- elif isinstance(evt.value, dict) and evt.value.get(
1157
- 'component', '') == 'video':
1158
- value = evt.value['value']['video']['path']
1159
- return (gr.update(visible=True,
1160
- scale=1), gr.update(visible=True, scale=1),
1161
- gr.update(visible=False), gr.update(visible=False),
1162
- gr.update(visible=False), gr.update(visible=True),
1163
- gr.update(), gr.update(), gr.update(value=value),
1164
- gr.update())
1165
- else:
1166
- return (gr.update(), gr.update(), gr.update(), gr.update(),
1167
- gr.update(), gr.update(), gr.update(), gr.update(),
1168
- gr.update(), gr.update())
1169
-
1170
- self.chatbot.select(edit_image,
1171
- outputs=[
1172
- self.chat_page, self.editor_page,
1173
- self.upload_tab, self.edit_tab,
1174
- self.image_view_tab, self.video_view_tab,
1175
- self.image_editor, self.image_viewer,
1176
- self.video_viewer, self.edit_tabs
1177
- ])
1178
-
1179
- if self.gradio_version < '5.0.0':
1180
- self.image_viewer.change(lambda x: x,
1181
- inputs=self.image_viewer,
1182
- outputs=self.image_viewer)
1183
-
1184
- ########################################
1185
- def submit_upload_image(image, history, images):
1186
- history, images, _ = self.add_uploaded_image_to_history(
1187
- image, history, images)
1188
- return gr.update(visible=False), gr.update(
1189
- visible=True), gr.update(
1190
- value=self.get_history(history)), history, images
1191
-
1192
- self.sub_btn_1.click(
1193
- submit_upload_image,
1194
- inputs=[self.image_uploader, self.history, self.images],
1195
- outputs=[
1196
- self.editor_page, self.chat_page, self.chatbot, self.history,
1197
- self.images
1198
- ])
1199
-
1200
- ########################################
1201
- def submit_edit_image(imagemask, mask_type, history, images):
1202
- history, images = self.add_edited_image_to_history(
1203
- imagemask, mask_type, history, images)
1204
- return gr.update(visible=False), gr.update(
1205
- visible=True), gr.update(
1206
- value=self.get_history(history)), history, images
1207
-
1208
- self.sub_btn_2.click(submit_edit_image,
1209
- inputs=[
1210
- self.image_editor, self.mask_type,
1211
- self.history, self.images
1212
- ],
1213
- outputs=[
1214
- self.editor_page, self.chat_page,
1215
- self.chatbot, self.history, self.images
1216
- ])
1217
-
1218
- ########################################
1219
- def exit_edit():
1220
- return gr.update(visible=False), gr.update(visible=True, scale=3)
1221
-
1222
- self.ext_btn_1.click(exit_edit,
1223
- outputs=[self.editor_page, self.chat_page])
1224
- self.ext_btn_2.click(exit_edit,
1225
- outputs=[self.editor_page, self.chat_page])
1226
- self.ext_btn_3.click(exit_edit,
1227
- outputs=[self.editor_page, self.chat_page])
1228
- self.ext_btn_4.click(exit_edit,
1229
- outputs=[self.editor_page, self.chat_page])
1230
-
1231
- ########################################
1232
- def update_mask_type_info(mask_type):
1233
- if mask_type == 'Background':
1234
- info = 'Background mode will not erase the visual content in the mask area'
1235
- visible = False
1236
- elif mask_type == 'Composite':
1237
- info = 'Composite mode will erase the visual content in the mask area'
1238
- visible = False
1239
- elif mask_type == 'Outpainting':
1240
- info = 'Outpaint mode is used for preparing input image for outpainting task'
1241
- visible = True
1242
- return (gr.update(
1243
- visible=True,
1244
- value=
1245
- f"<div style='background-color: white; padding-left: 15px; color: grey;'>{info}</div>"
1246
- ), gr.update(visible=visible))
1247
-
1248
- self.mask_type.change(update_mask_type_info,
1249
- inputs=self.mask_type,
1250
- outputs=[self.mask_type_info, self.outpaint_tab])
1251
-
1252
- ########################################
1253
- def extend_image(top_ratio, bottom_ratio, left_ratio, right_ratio,
1254
- image):
1255
- img = cv2.cvtColor(image['background'], cv2.COLOR_RGBA2RGB)
1256
- h, w = img.shape[:2]
1257
- new_h = int(h * (top_ratio + bottom_ratio + 1))
1258
- new_w = int(w * (left_ratio + right_ratio + 1))
1259
- start_h = int(h * top_ratio)
1260
- start_w = int(w * left_ratio)
1261
- new_img = np.zeros((new_h, new_w, 3), dtype=np.uint8)
1262
- new_mask = np.ones((new_h, new_w, 1), dtype=np.uint8) * 255
1263
- new_img[start_h:start_h + h, start_w:start_w + w, :] = img
1264
- new_mask[start_h:start_h + h, start_w:start_w + w] = 0
1265
- layer = np.concatenate([new_img, new_mask], axis=2)
1266
- value = {
1267
- 'background': new_img,
1268
- 'composite': new_img,
1269
- 'layers': [layer]
1270
- }
1271
- return gr.update(value=value)
1272
-
1273
- self.img_pad_btn.click(extend_image,
1274
- inputs=[
1275
- self.top_ext, self.bottom_ext,
1276
- self.left_ext, self.right_ext,
1277
- self.image_editor
1278
- ],
1279
- outputs=self.image_editor)
1280
-
1281
- ########################################
1282
- def clear_chat(history, images, history_result):
1283
- history.clear()
1284
- images.clear()
1285
- history_result.clear()
1286
- return history, images, history_result, self.get_history(history)
1287
-
1288
- self.clear_btn.click(
1289
- clear_chat,
1290
- inputs=[self.history, self.images, self.history_result],
1291
- outputs=[
1292
- self.history, self.images, self.history_result, self.chatbot
1293
- ])
1294
-
1295
- def get_history(self, history):
1296
- info = []
1297
- for item in history:
1298
- new_item = [None, None]
1299
- if isinstance(item[0], str) and item[0].endswith('.mp4'):
1300
- new_item[0] = gr.Video(item[0], format='mp4')
1301
- else:
1302
- new_item[0] = item[0]
1303
- if isinstance(item[1], str) and item[1].endswith('.mp4'):
1304
- new_item[1] = gr.Video(item[1], format='mp4')
1305
- else:
1306
- new_item[1] = item[1]
1307
- info.append(new_item)
1308
- return info
1309
-
1310
- def generate_random_string(self, length=20):
1311
- letters_and_digits = string.ascii_letters + string.digits
1312
- random_string = ''.join(
1313
- random.choice(letters_and_digits) for i in range(length))
1314
- return random_string
1315
-
1316
- def add_edited_image_to_history(self, image, mask_type, history, images):
1317
- if mask_type == 'Composite':
1318
- img = Image.fromarray(image['composite'])
1319
- else:
1320
- img = Image.fromarray(image['background'])
1321
-
1322
- img_id = get_md5(self.generate_random_string())[:12]
1323
- save_path = os.path.join(self.cache_dir, f'{img_id}.png')
1324
- img.convert('RGB').save(save_path)
1325
-
1326
- mask = image['layers'][0][:, :, 3]
1327
- mask = Image.fromarray(mask).convert('RGB')
1328
- mask_path = os.path.join(self.cache_dir, f'{img_id}_mask.png')
1329
- mask.save(mask_path)
1330
-
1331
- w, h = img.size
1332
- if w > h:
1333
- tb_w = 128
1334
- tb_h = int(h * tb_w / w)
1335
- else:
1336
- tb_h = 128
1337
- tb_w = int(w * tb_h / h)
1338
-
1339
- if mask_type == 'Background':
1340
- comp_mask = np.array(mask, dtype=np.uint8)
1341
- mask_alpha = (comp_mask[:, :, 0:1].astype(np.float32) *
1342
- 0.6).astype(np.uint8)
1343
- comp_mask = np.concatenate([comp_mask, mask_alpha], axis=2)
1344
- thumbnail = Image.alpha_composite(
1345
- img.convert('RGBA'),
1346
- Image.fromarray(comp_mask).convert('RGBA')).convert('RGB')
1347
- else:
1348
- thumbnail = img.convert('RGB')
1349
-
1350
- thumbnail_path = os.path.join(self.cache_dir,
1351
- f'{img_id}_thumbnail.jpg')
1352
- thumbnail = thumbnail.resize((tb_w, tb_h))
1353
- thumbnail.save(thumbnail_path, format='JPEG')
1354
-
1355
- buffered = io.BytesIO()
1356
- img.convert('RGB').save(buffered, format='PNG')
1357
- img_b64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
1358
- img_str = f'<img src="data:image/png;base64,{img_b64}" style="pointer-events: none;">'
1359
-
1360
- buffered = io.BytesIO()
1361
- mask.convert('RGB').save(buffered, format='PNG')
1362
- mask_b64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
1363
- mask_str = f'<img src="data:image/png;base64,{mask_b64}" style="pointer-events: none;">'
1364
-
1365
- images[img_id] = {
1366
- 'image': save_path,
1367
- 'mask': mask_path,
1368
- 'mask_type': mask_type,
1369
- 'thumbnail': thumbnail_path
1370
- }
1371
- history.append((
1372
- None,
1373
- f'This is edited image and mask:\n {img_str} {mask_str} image ID is: {img_id}'
1374
- ))
1375
- return history, images
1376
-
1377
- def add_uploaded_image_to_history(self, img, history, images):
1378
- img_id = get_md5(self.generate_random_string())[:12]
1379
- save_path = os.path.join(self.cache_dir, f'{img_id}.png')
1380
- w, h = img.size
1381
- if w > 2048:
1382
- ratio = w / 2048.
1383
- w = 2048
1384
- h = int(h / ratio)
1385
- if h > 2048:
1386
- ratio = h / 2048.
1387
- h = 2048
1388
- w = int(w / ratio)
1389
- img = img.resize((w, h))
1390
- img.save(save_path)
1391
-
1392
- w, h = img.size
1393
- if w > h:
1394
- tb_w = 128
1395
- tb_h = int(h * tb_w / w)
1396
- else:
1397
- tb_h = 128
1398
- tb_w = int(w * tb_h / h)
1399
- thumbnail_path = os.path.join(self.cache_dir,
1400
- f'{img_id}_thumbnail.jpg')
1401
- thumbnail = img.resize((tb_w, tb_h))
1402
- thumbnail.save(thumbnail_path, format='JPEG')
1403
-
1404
- images[img_id] = {
1405
- 'image': save_path,
1406
- 'mask': None,
1407
- 'mask_type': None,
1408
- 'thumbnail': thumbnail_path
1409
- }
1410
-
1411
- buffered = io.BytesIO()
1412
- img.convert('RGB').save(buffered, format='PNG')
1413
- img_b64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
1414
- img_str = f'<img src="data:image/png;base64,{img_b64}" style="pointer-events: none;">'
1415
-
1416
- history.append(
1417
- (None,
1418
- f'This is uploaded image:\n {img_str} image ID is: {img_id}'))
1419
- return history, images, img_id
1420
-
1421
-
1422
- if __name__ == '__main__':
1423
- cfg = "config/chatbot_ui.yaml"
1424
- with gr.Blocks() as demo:
1425
- chatbot = ChatBotUI(cfg)
1426
- chatbot.create_ui()
1427
- chatbot.set_callbacks()
1428
- demo.launch()
 
 
config/chatbot_ui.yaml DELETED
@@ -1,25 +0,0 @@
- WORK_DIR: chatbot
- FILE_SYSTEM:
-   - NAME: LocalFs
-     TEMP_DIR: ./cache
-   - NAME: ModelscopeFs
-     TEMP_DIR: ./cache
-   - NAME: HuggingfaceFs
-     TEMP_DIR: ./cache
- #
- ENABLE_I2V: False
- SKIP_EXAMPLES: True
- #
- MODEL:
-   EDIT_MODEL:
-     MODEL_CFG_DIR: config/models/
-   I2V:
-     MODEL_NAME: CogVideoX-5b-I2V
-     MODEL_DIR: ms://ZhipuAI/CogVideoX-5b-I2V/
-   CAPTIONER:
-     MODEL_NAME: InternVL2-2B
-     MODEL_DIR: ms://OpenGVLab/InternVL2-2B/
-     PROMPT: '<image>\nThis image is the first frame of a video. Based on this image, please imagine what changes may occur in the next few seconds of the video. Please output brief description, such as "a dog running" or "a person turns to left". No more than 30 words.'
-   ENHANCER:
-     MODEL_NAME: Meta-Llama-3.1-8B-Instruct
-     MODEL_DIR: ms://LLM-Research/Meta-Llama-3.1-8B-Instruct/
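Note (not part of the commit): a minimal sketch of inspecting this removed config with plain PyYAML, assuming the file is restored locally and PyYAML is installed; the keys are exactly those shown above.

    import yaml  # hypothetical inspection script, not part of this repo

    with open('config/chatbot_ui.yaml') as f:
        cfg = yaml.safe_load(f)

    # Which auxiliary models the chatbot UI would pull in:
    print(cfg['MODEL']['CAPTIONER']['MODEL_NAME'])   # InternVL2-2B
    print(cfg['MODEL']['ENHANCER']['MODEL_DIR'])     # ms://LLM-Research/Meta-Llama-3.1-8B-Instruct/
    print(cfg['ENABLE_I2V'], cfg['SKIP_EXAMPLES'])   # False True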
 
 
config/models/ace_flux_dev.yaml DELETED
@@ -1,187 +0,0 @@
1
- NAME: ACE_FLUX.1_dev
2
- IS_DEFAULT: True
3
- USE_DYNAMIC_MODEL: False
4
- INFERENCE_TYPE: ACE_FLUX
5
- MAX_SEQ_LENGTH: 3072
6
- SRC_MAX_SEQ_LENGTH: 2048
7
- DEFAULT_PARAS:
8
- PARAS:
9
- #
10
- INPUT:
11
- INPUT_IMAGE:
12
- INPUT_MASK:
13
- TASK:
14
- PROMPT: ""
15
- OUTPUT_HEIGHT: 1024
16
- OUTPUT_WIDTH: 1024
17
- SAMPLER: flow_euler
18
- SAMPLE_STEPS: 20
19
- GUIDE_SCALE: 3.5
20
- SEED: -1
21
- TAR_INDEX: 0
22
- ALIGN: False
23
- OUTPUT:
24
- LATENT:
25
- IMAGES:
26
- SEED:
27
- MODULES_PARAS:
28
- FIRST_STAGE_MODEL:
29
- FUNCTION:
30
- - NAME: encode
31
- DTYPE: bfloat16
32
- INPUT: [ "IMAGE" ]
33
- - NAME: decode
34
- DTYPE: bfloat16
35
- INPUT: [ "LATENT" ]
36
- PARAS:
37
- SCALE_FACTOR: 1.5305
38
- SHIFT_FACTOR: 0.0609
39
- SIZE_FACTOR: 8
40
- DIFFUSION_MODEL:
41
- FUNCTION:
42
- - NAME: forward
43
- DTYPE: bfloat16
44
- INPUT: [ "SAMPLE_STEPS", "SAMPLE", "GUIDE_SCALE" ]
45
- COND_STAGE_MODEL:
46
- FUNCTION:
47
- - NAME: encode_list
48
- DTYPE: bfloat16
49
- INPUT: [ "PROMPT" ]
50
- #
51
- MODEL:
52
- NAME: LatentDiffusionACEFlux
53
- PARAMETERIZATION: rf
54
- PRETRAINED_MODEL:
55
- IGNORE_KEYS: [ ]
56
- SIZE_FACTOR: 8
57
- TEXT_IDENTIFIER: [ '{image}', '{image1}', '{image2}', '{image3}', '{image4}', '{image5}', '{image6}', '{image7}', '{image8}', '{image9}' ]
58
- USE_TEXT_POS_EMBEDDINGS: True
59
- DIFFUSION:
60
- # NAME DESCRIPTION: TYPE: default: 'DiffusionFluxRF'
61
- NAME: DiffusionFluxRF
62
- PREDICTION_TYPE: raw
63
- # NOISE_SCHEDULER DESCRIPTION: TYPE: default: ''
64
- NOISE_SCHEDULER:
65
- NAME: FlowMatchFluxShiftScheduler
66
- SHIFT: True
67
- SIGMOID_SCALE: 1
68
- BASE_SHIFT: 0.5
69
- MAX_SHIFT: 1.15
70
- #
71
- DIFFUSION_MODEL:
72
- # NAME DESCRIPTION: TYPE: default: 'Flux'
73
- NAME: ACEFlux
74
- PRETRAINED_MODEL: hf://black-forest-labs/[email protected]
75
- SWIFT_LORA_MODEL: ["hf://scepter-studio/ACE-FLUX.1-dev@ace_flux.1_dev_lora.bin"]
76
- # IN_CHANNELS DESCRIPTION: model's input channels. TYPE: int default: 64
77
- IN_CHANNELS: 64
78
- # HIDDEN_SIZE DESCRIPTION: model's hidden size. TYPE: int default: 1024
79
- HIDDEN_SIZE: 3072
80
- # NUM_HEADS DESCRIPTION: number of heads in the transformer. TYPE: int default: 16
81
- NUM_HEADS: 24
82
- # AXES_DIM DESCRIPTION: dimensions of the axes of the positional encoding. TYPE: list default: [16, 56, 56]
83
- AXES_DIM: [ 16, 56, 56 ]
84
- # THETA DESCRIPTION: theta for positional encoding. TYPE: int default: 10000
85
- THETA: 10000
86
- # VEC_IN_DIM DESCRIPTION: dimension of the vector input. TYPE: int default: 768
87
- VEC_IN_DIM: 768
88
- # GUIDANCE_EMBED DESCRIPTION: whether to use guidance embedding. TYPE: bool default: False
89
- GUIDANCE_EMBED: True
90
- # CONTEXT_IN_DIM DESCRIPTION: dimension of the context input. TYPE: int default: 4096
91
- CONTEXT_IN_DIM: 4096
92
- # MLP_RATIO DESCRIPTION: ratio of mlp hidden size to hidden size. TYPE: float default: 4.0
93
- MLP_RATIO: 4.0
94
- # QKV_BIAS DESCRIPTION: whether to use bias in qkv projection. TYPE: bool default: True
95
- QKV_BIAS: True
96
- # DEPTH DESCRIPTION: number of transformer blocks. TYPE: int default: 19
97
- DEPTH: 19
98
- # DEPTH_SINGLE_BLOCKS DESCRIPTION: number of transformer blocks in the single stream block. TYPE: int default: 38
99
- DEPTH_SINGLE_BLOCKS: 38
100
- ATTN_BACKEND: pytorch
101
-
102
- #
103
- FIRST_STAGE_MODEL:
104
- NAME: AutoencoderKLFlux
105
- EMBED_DIM: 16
106
- PRETRAINED_MODEL: hf://black-forest-labs/[email protected]
107
- IGNORE_KEYS: [ ]
108
- BATCH_SIZE: 8
109
- USE_CONV: False
110
- SCALE_FACTOR: 0.3611
111
- SHIFT_FACTOR: 0.1159
112
- #
113
- ENCODER:
114
- NAME: Encoder
115
- USE_CHECKPOINT: True
116
- CH: 128
117
- OUT_CH: 3
118
- NUM_RES_BLOCKS: 2
119
- IN_CHANNELS: 3
120
- ATTN_RESOLUTIONS: [ ]
121
- CH_MULT: [ 1, 2, 4, 4 ]
122
- Z_CHANNELS: 16
123
- DOUBLE_Z: True
124
- DROPOUT: 0.0
125
- RESAMP_WITH_CONV: True
126
- #
127
- DECODER:
128
- NAME: Decoder
129
- USE_CHECKPOINT: True
130
- CH: 128
131
- OUT_CH: 3
132
- NUM_RES_BLOCKS: 2
133
- IN_CHANNELS: 3
134
- ATTN_RESOLUTIONS: [ ]
135
- CH_MULT: [ 1, 2, 4, 4 ]
136
- Z_CHANNELS: 16
137
- DROPOUT: 0.0
138
- RESAMP_WITH_CONV: True
139
- GIVE_PRE_END: False
140
- TANH_OUT: False
141
- #
142
- COND_STAGE_MODEL:
143
- # NAME DESCRIPTION: TYPE: default: 'T5PlusClipFluxEmbedder'
144
- NAME: T5ACEPlusClipFluxEmbedder
145
- # T5_MODEL DESCRIPTION: TYPE: default: ''
146
- T5_MODEL:
147
- # NAME DESCRIPTION: TYPE: default: 'HFEmbedder'
148
- NAME: ACEHFEmbedder
149
- # HF_MODEL_CLS DESCRIPTION: huggingface cls in transfomer TYPE: NoneType default: None
150
- HF_MODEL_CLS: T5EncoderModel
151
- # MODEL_PATH DESCRIPTION: model folder path TYPE: NoneType default: None
152
- MODEL_PATH: hf://black-forest-labs/FLUX.1-dev@text_encoder_2/
153
- # HF_TOKENIZER_CLS DESCRIPTION: huggingface cls in transfomer TYPE: NoneType default: None
154
- HF_TOKENIZER_CLS: T5Tokenizer
155
- # TOKENIZER_PATH DESCRIPTION: tokenizer folder path TYPE: NoneType default: None
156
- TOKENIZER_PATH: hf://black-forest-labs/FLUX.1-dev@tokenizer_2/
157
- ADDED_IDENTIFIER: [ '<img>','{image}', '{caption}', '{mask}', '{ref_image}', '{image1}', '{image2}', '{image3}', '{image4}', '{image5}', '{image6}', '{image7}', '{image8}', '{image9}' ]
158
- # MAX_LENGTH DESCRIPTION: max length of input TYPE: int default: 77
159
- MAX_LENGTH: 512
160
- # OUTPUT_KEY DESCRIPTION: output key TYPE: str default: 'last_hidden_state'
161
- OUTPUT_KEY: last_hidden_state
162
- # D_TYPE DESCRIPTION: dtype TYPE: str default: 'bfloat16'
163
- D_TYPE: bfloat16
164
- # BATCH_INFER DESCRIPTION: batch infer TYPE: bool default: False
165
- BATCH_INFER: False
166
- CLEAN: whitespace
167
- # CLIP_MODEL DESCRIPTION: TYPE: default: ''
168
- CLIP_MODEL:
169
- # NAME DESCRIPTION: TYPE: default: 'HFEmbedder'
170
- NAME: ACEHFEmbedder
171
- # HF_MODEL_CLS DESCRIPTION: huggingface cls in transfomer TYPE: NoneType default: None
172
- HF_MODEL_CLS: CLIPTextModel
173
- # MODEL_PATH DESCRIPTION: model folder path TYPE: NoneType default: None
174
- MODEL_PATH: hf://black-forest-labs/FLUX.1-dev@text_encoder/
175
- # HF_TOKENIZER_CLS DESCRIPTION: huggingface cls in transfomer TYPE: NoneType default: None
176
- HF_TOKENIZER_CLS: CLIPTokenizer
177
- # TOKENIZER_PATH DESCRIPTION: tokenizer folder path TYPE: NoneType default: None
178
- TOKENIZER_PATH: hf://black-forest-labs/FLUX.1-dev@tokenizer/
179
- # MAX_LENGTH DESCRIPTION: max length of input TYPE: int default: 77
180
- MAX_LENGTH: 77
181
- # OUTPUT_KEY DESCRIPTION: output key TYPE: str default: 'last_hidden_state'
182
- OUTPUT_KEY: pooler_output
183
- # D_TYPE DESCRIPTION: dtype TYPE: str default: 'bfloat16'
184
- D_TYPE: bfloat16
185
- # BATCH_INFER DESCRIPTION: batch infer TYPE: bool default: False
186
- BATCH_INFER: True
187
- CLEAN: whitespace
 
 
example.py DELETED
@@ -1,370 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- # Copyright (c) Alibaba, Inc. and its affiliates.
3
- import os
4
- from PIL import Image
5
- from scepter.modules.utils.file_system import FS
6
-
7
-
8
- def download_image(image, local_path=None):
9
- if not FS.exists(local_path):
10
- local_path = FS.get_from(image, local_path=local_path)
11
- if local_path.split(".")[-1] in ['jpg', 'jpeg']:
12
- im = Image.open(local_path).convert("RGB")
13
- im.save(local_path, format='JPEG')
14
- return local_path
15
-
16
-
17
- def get_examples(cache_dir):
18
- print('Downloading Examples ...')
19
- examples = [
20
- [
21
- 'Facial Editing',
22
- download_image(
23
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/e33edc106953.png?raw=true',
24
- os.path.join(cache_dir, 'examples/e33edc106953.jpg')), None,
25
- None, '{image} let the man smile', 6666
26
- ],
27
- [
28
- 'Facial Editing',
29
- download_image(
30
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/5d2bcc91a3e9.png?raw=true',
31
- os.path.join(cache_dir, 'examples/5d2bcc91a3e9.jpg')), None,
32
- None, 'let the man in {image} wear sunglasses', 9999
33
- ],
34
- [
35
- 'Facial Editing',
36
- download_image(
37
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/3a52eac708bd.png?raw=true',
38
- os.path.join(cache_dir, 'examples/3a52eac708bd.jpg')), None,
39
- None, '{image} red hair', 9999
40
- ],
41
- [
42
- 'Facial Editing',
43
- download_image(
44
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/3f4dc464a0ea.png?raw=true',
45
- os.path.join(cache_dir, 'examples/3f4dc464a0ea.jpg')), None,
46
- None, '{image} let the man serious', 99999
47
- ],
48
- [
49
- 'Controllable Generation',
50
- download_image(
51
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/131ca90fd2a9.png?raw=true',
52
- os.path.join(cache_dir,
53
- 'examples/131ca90fd2a9.jpg')), None, None,
54
- '"A person sits contemplatively on the ground, surrounded by falling autumn leaves. Dressed in a green sweater and dark blue pants, they rest their chin on their hand, exuding a relaxed demeanor. Their stylish checkered slip-on shoes add a touch of flair, while a black purse lies in their lap. The backdrop of muted brown enhances the warm, cozy atmosphere of the scene." , generate the image that corresponds to the given scribble {image}.',
55
- 613725
56
- ],
57
- [
58
- 'Render Text',
59
- download_image(
60
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/33e9f27c2c48.png?raw=true',
61
- os.path.join(cache_dir, 'examples/33e9f27c2c48.jpg')),
62
- download_image(
63
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/33e9f27c2c48_mask.png?raw=true',
64
- os.path.join(cache_dir,
65
- 'examples/33e9f27c2c48_mask.jpg')), None,
66
- 'Put the text "C A T" at the position marked by mask in the {image}',
67
- 6666
68
- ],
69
- [
70
- 'Style Transfer',
71
- download_image(
72
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/9e73e7eeef55.png?raw=true',
73
- os.path.join(cache_dir, 'examples/9e73e7eeef55.jpg')), None,
74
- download_image(
75
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/2e02975293d6.png?raw=true',
76
- os.path.join(cache_dir, 'examples/2e02975293d6.jpg')),
77
- 'edit {image} based on the style of {image1} ', 99999
78
- ],
79
- [
80
- 'Outpainting',
81
- download_image(
82
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/f2b22c08be3f.png?raw=true',
83
- os.path.join(cache_dir, 'examples/f2b22c08be3f.jpg')),
84
- download_image(
85
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/f2b22c08be3f_mask.png?raw=true',
86
- os.path.join(cache_dir,
87
- 'examples/f2b22c08be3f_mask.jpg')), None,
88
- 'Could the {image} be widened within the space designated by mask, while retaining the original?',
89
- 6666
90
- ],
91
- [
92
- 'Image Segmentation',
93
- download_image(
94
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/db3ebaa81899.png?raw=true',
95
- os.path.join(cache_dir, 'examples/db3ebaa81899.jpg')), None,
96
- None, '{image} Segmentation', 6666
97
- ],
98
- [
99
- 'Depth Estimation',
100
- download_image(
101
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/f1927c4692ba.png?raw=true',
102
- os.path.join(cache_dir, 'examples/f1927c4692ba.jpg')), None,
103
- None, '{image} Depth Estimation', 6666
104
- ],
105
- [
106
- 'Pose Estimation',
107
- download_image(
108
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/014e5bf3b4d1.png?raw=true',
109
- os.path.join(cache_dir, 'examples/014e5bf3b4d1.jpg')), None,
110
- None, '{image} distinguish the poses of the figures', 999999
111
- ],
112
- [
113
- 'Scribble Extraction',
114
- download_image(
115
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/5f59a202f8ac.png?raw=true',
116
- os.path.join(cache_dir, 'examples/5f59a202f8ac.jpg')), None,
117
- None, 'Generate a scribble of {image}, please.', 6666
118
- ],
119
- [
120
- 'Mosaic',
121
- download_image(
122
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/3a2f52361eea.png?raw=true',
123
- os.path.join(cache_dir, 'examples/3a2f52361eea.jpg')), None,
124
- None, 'Adapt {image} into a mosaic representation.', 6666
125
- ],
126
- [
127
- 'Edge map Extraction',
128
- download_image(
129
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/b9d1e519d6e5.png?raw=true',
130
- os.path.join(cache_dir, 'examples/b9d1e519d6e5.jpg')), None,
131
- None, 'Get the edge-enhanced result for {image}.', 6666
132
- ],
133
- [
134
- 'Grayscale',
135
- download_image(
136
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/c4ebbe2ba29b.png?raw=true',
137
- os.path.join(cache_dir, 'examples/c4ebbe2ba29b.jpg')), None,
138
- None, 'transform {image} into a black and white one', 6666
139
- ],
140
- [
141
- 'Contour Extraction',
142
- download_image(
143
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/19652d0f6c4b.png?raw=true',
144
- os.path.join(cache_dir,
145
- 'examples/19652d0f6c4b.jpg')), None, None,
146
- 'Would you be able to make a contour picture from {image} for me?',
147
- 6666
148
- ],
149
- [
150
- 'Controllable Generation',
151
- download_image(
152
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/249cda2844b7.png?raw=true',
153
- os.path.join(cache_dir,
154
- 'examples/249cda2844b7.jpg')), None, None,
155
- 'Following the segmentation outcome in mask of {image}, develop a real-life image using the explanatory note in "a mighty cat lying on the bed”.',
156
- 6666
157
- ],
158
- [
159
- 'Controllable Generation',
160
- download_image(
161
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/411f6c4b8e6c.png?raw=true',
162
- os.path.join(cache_dir,
163
- 'examples/411f6c4b8e6c.jpg')), None, None,
164
- 'use the depth map {image} and the text caption "a cut white cat" to create a corresponding graphic image',
165
- 999999
166
- ],
167
- [
168
- 'Controllable Generation',
169
- download_image(
170
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/a35c96ed137a.png?raw=true',
171
- os.path.join(cache_dir,
172
- 'examples/a35c96ed137a.jpg')), None, None,
173
- 'help translate this posture schema {image} into a colored image based on the context I provided "A beautiful woman Climbing the climbing wall, wearing a harness and climbing gear, skillfully maneuvering up the wall with her back to the camera, with a safety rope."',
174
- 3599999
175
- ],
176
- [
177
- 'Controllable Generation',
178
- download_image(
179
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/dcb2fc86f1ce.png?raw=true',
180
- os.path.join(cache_dir,
181
- 'examples/dcb2fc86f1ce.jpg')), None, None,
182
- 'Transform and generate an image using mosaic {image} and "Monarch butterflies gracefully perch on vibrant purple flowers, showcasing their striking orange and black wings in a lush garden setting." description',
183
- 6666
184
- ],
185
- [
186
- 'Controllable Generation',
187
- download_image(
188
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/4cd4ee494962.png?raw=true',
189
- os.path.join(cache_dir,
190
- 'examples/4cd4ee494962.jpg')), None, None,
191
- 'make this {image} colorful as per the "beautiful sunflowers"',
192
- 6666
193
- ],
194
- [
195
- 'Controllable Generation',
196
- download_image(
197
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/a47e3a9cd166.png?raw=true',
198
- os.path.join(cache_dir,
199
- 'examples/a47e3a9cd166.jpg')), None, None,
200
- 'Take the edge conscious {image} and the written guideline "A whimsical animated character is depicted holding a delectable cake adorned with blue and white frosting and a drizzle of chocolate. The character wears a yellow headband with a bow, matching a cozy yellow sweater. Her dark hair is styled in a braid, tied with a yellow ribbon. With a golden fork in hand, she stands ready to enjoy a slice, exuding an air of joyful anticipation. The scene is creatively rendered with a charming and playful aesthetic." and produce a realistic image.',
201
- 613725
202
- ],
203
- [
204
- 'Controllable Generation',
205
- download_image(
206
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/d890ed8a3ac2.png?raw=true',
207
- os.path.join(cache_dir,
208
- 'examples/d890ed8a3ac2.jpg')), None, None,
209
- 'creating a vivid image based on {image} and description "This image features a delicious rectangular tart with a flaky, golden-brown crust. The tart is topped with evenly sliced tomatoes, layered over a creamy cheese filling. Aromatic herbs are sprinkled on top, adding a touch of green and enhancing the visual appeal. The background includes a soft, textured fabric and scattered white flowers, creating an elegant and inviting presentation. Bright red tomatoes in the upper right corner hint at the fresh ingredients used in the dish."',
210
- 6666
211
- ],
212
- [
213
- 'Image Denoising',
214
- download_image(
215
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/0844a686a179.png?raw=true',
216
- os.path.join(cache_dir,
217
- 'examples/0844a686a179.jpg')), None, None,
218
- 'Eliminate noise interference in {image} and maximize the crispness to obtain superior high-definition quality',
219
- 6666
220
- ],
221
- [
222
- 'Inpainting',
223
- download_image(
224
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/fa91b6b7e59b.png?raw=true',
225
- os.path.join(cache_dir, 'examples/fa91b6b7e59b.jpg')),
226
- download_image(
227
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/fa91b6b7e59b_mask.png?raw=true',
228
- os.path.join(cache_dir,
229
- 'examples/fa91b6b7e59b_mask.jpg')), None,
230
- 'Ensure to overhaul the parts of the {image} indicated by the mask.',
231
- 6666
232
- ],
233
- [
234
- 'Inpainting',
235
- download_image(
236
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/632899695b26.png?raw=true',
237
- os.path.join(cache_dir, 'examples/632899695b26.jpg')),
238
- download_image(
239
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/632899695b26_mask.png?raw=true',
240
- os.path.join(cache_dir,
241
- 'examples/632899695b26_mask.jpg')), None,
242
- 'Refashion the mask portion of {image} in accordance with "A yellow egg with a smiling face painted on it"',
243
- 6666
244
- ],
245
- [
246
- 'General Editing',
247
- download_image(
248
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/354d17594afe.png?raw=true',
249
- os.path.join(cache_dir,
250
- 'examples/354d17594afe.jpg')), None, None,
251
- '{image} change the dog\'s posture to walking in the water, and change the background to green plants and a pond.',
252
- 6666
253
- ],
254
- [
255
- 'General Editing',
256
- download_image(
257
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/38946455752b.png?raw=true',
258
- os.path.join(cache_dir,
259
- 'examples/38946455752b.jpg')), None, None,
260
- '{image} change the color of the dress from white to red and the model\'s hair color red brown to blonde.Other parts remain unchanged',
261
- 6669
262
- ],
263
- [
264
- 'Facial Editing',
265
- download_image(
266
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/3ba5202f0cd8.png?raw=true',
267
- os.path.join(cache_dir,
268
- 'examples/3ba5202f0cd8.jpg')), None, None,
269
- 'Keep the same facial feature in @3ba5202f0cd8, change the woman\'s clothing from a Blue denim jacket to a white turtleneck sweater and adjust her posture so that she is supporting her chin with both hands. Other aspects, such as background, hairstyle, facial expression, etc, remain unchanged.',
270
- 99999
271
- ],
272
- [
273
- 'Facial Editing',
274
- download_image(
275
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/369365b94725.png?raw=true',
276
- os.path.join(cache_dir, 'examples/369365b94725.jpg')), None,
277
- None, '{image} Make her looking at the camera', 6666
278
- ],
279
- [
280
- 'Facial Editing',
281
- download_image(
282
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/92751f2e4a0e.png?raw=true',
283
- os.path.join(cache_dir, 'examples/92751f2e4a0e.jpg')), None,
284
- None, '{image} Remove the smile from his face', 9899999
285
- ],
286
- [
287
- 'Remove Text',
288
- download_image(
289
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/8530a6711b2e.png?raw=true',
290
- os.path.join(cache_dir, 'examples/8530a6711b2e.jpg')), None,
291
- None, 'Aim to remove any textual element in {image}', 6666
292
- ],
293
- [
294
- 'Remove Text',
295
- download_image(
296
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/c4d7fb28f8f6.png?raw=true',
297
- os.path.join(cache_dir, 'examples/c4d7fb28f8f6.jpg')),
298
- download_image(
299
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/c4d7fb28f8f6_mask.png?raw=true',
300
- os.path.join(cache_dir,
301
- 'examples/c4d7fb28f8f6_mask.jpg')), None,
302
- 'Rub out any text found in the mask sector of the {image}.', 6666
303
- ],
304
- [
305
- 'Remove Object',
306
- download_image(
307
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/e2f318fa5e5b.png?raw=true',
308
- os.path.join(cache_dir,
309
- 'examples/e2f318fa5e5b.jpg')), None, None,
310
- 'Remove the unicorn in this {image}, ensuring a smooth edit.',
311
- 99999
312
- ],
313
- [
314
- 'Remove Object',
315
- download_image(
316
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/1ae96d8aca00.png?raw=true',
317
- os.path.join(cache_dir, 'examples/1ae96d8aca00.jpg')),
318
- download_image(
319
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/1ae96d8aca00_mask.png?raw=true',
320
- os.path.join(cache_dir, 'examples/1ae96d8aca00_mask.jpg')),
321
- None, 'Discard the contents of the mask area from {image}.', 99999
322
- ],
323
- [
324
- 'Add Object',
325
- download_image(
326
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/80289f48e511.png?raw=true',
327
- os.path.join(cache_dir, 'examples/80289f48e511.jpg')),
328
- download_image(
329
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/80289f48e511_mask.png?raw=true',
330
- os.path.join(cache_dir,
331
- 'examples/80289f48e511_mask.jpg')), None,
332
- 'add a Hot Air Balloon into the {image}, per the mask', 613725
333
- ],
334
- [
335
- 'Style Transfer',
336
- download_image(
337
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/d725cb2009e8.png?raw=true',
338
- os.path.join(cache_dir, 'examples/d725cb2009e8.jpg')), None,
339
- None, 'Change the style of {image} to colored pencil style', 99999
340
- ],
341
- [
342
- 'Style Transfer',
343
- download_image(
344
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/e0f48b3fd010.png?raw=true',
345
- os.path.join(cache_dir, 'examples/e0f48b3fd010.jpg')), None,
346
- None, 'make {image} to Walt Disney Animation style', 99999
347
- ],
348
- [
349
- 'Try On',
350
- download_image(
351
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/ee4ca60b8c96.png?raw=true',
352
- os.path.join(cache_dir, 'examples/ee4ca60b8c96.jpg')),
353
- download_image(
354
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/ee4ca60b8c96_mask.png?raw=true',
355
- os.path.join(cache_dir, 'examples/ee4ca60b8c96_mask.jpg')),
356
- download_image(
357
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/ebe825bbfe3c.png?raw=true',
358
- os.path.join(cache_dir, 'examples/ebe825bbfe3c.jpg')),
359
- 'Change the cloth in {image} to the one in {image1}', 99999
360
- ],
361
- [
362
- 'Workflow',
363
- download_image(
364
- 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/cb85353c004b.png?raw=true',
365
- os.path.join(cache_dir, 'examples/cb85353c004b.jpg')), None,
366
- None, '<workflow> ice cream {image}', 99999
367
- ],
368
- ]
369
- print('Finish. Start building UI ...')
370
- return examples
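Note (not part of the commit): each row returned by get_examples(cache_dir) is [task, image, mask, ref_image, prompt, seed], matching the inputs of the gr.Examples block in the UI code above. A minimal usage sketch, assuming the repository root is on PYTHONPATH:

    from example import get_examples

    rows = get_examples('./cache')   # downloads the sample assets into ./cache
    task, image, mask, ref_image, prompt, seed = rows[0]
    print(task, prompt, seed)        # e.g. Facial Editing, '{image} let the man smile', 6666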
 
 
models/__init__.py DELETED
@@ -1,2 +0,0 @@
- from .flux import Flux, ACEFlux
- from .embedder import ACETextEmbedder, T5ACEPlusClipFluxEmbedder, ACEHFEmbedder
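Note (not part of the commit): importing this package is what registers the classes, since models/flux.py and models/embedder.py decorate them with BACKBONES.register_class() / EMBEDDERS.register_class(). A minimal sketch, assuming the repository root is the working directory:

    # importing the (now removed) package runs the register_class decorators
    import models
    from scepter.modules.model.registry import BACKBONES, EMBEDDERS
    # after this import, config names such as ACEFlux and T5ACEPlusClipFluxEmbedder
    # (see config/models/ace_flux_dev.yaml above) resolve against these registries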
 
 
 
models/embedder.py DELETED
@@ -1,383 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- # Copyright (c) Alibaba, Inc. and its affiliates.
3
- import warnings
4
- from contextlib import nullcontext
5
-
6
- import torch
7
- import torch.nn.functional as F
8
- import torch.utils.dlpack
9
- import transformers
10
- from scepter.modules.model.embedder.base_embedder import BaseEmbedder
11
- from scepter.modules.model.registry import EMBEDDERS
12
- from scepter.modules.model.tokenizer.tokenizer_component import (
13
- basic_clean, canonicalize, heavy_clean, whitespace_clean)
14
- from scepter.modules.utils.config import dict_to_yaml
15
- from scepter.modules.utils.distribute import we
16
- from scepter.modules.utils.file_system import FS
17
-
18
- try:
19
- from transformers import AutoTokenizer, T5EncoderModel
20
- except Exception as e:
21
- warnings.warn(
22
- f'Import transformers error, please deal with this problem: {e}')
23
-
24
-
25
- @EMBEDDERS.register_class()
26
- class ACETextEmbedder(BaseEmbedder):
27
- """
28
- Uses the OpenCLIP transformer encoder for text
29
- """
30
- """
31
- Uses the OpenCLIP transformer encoder for text
32
- """
33
- para_dict = {
34
- 'PRETRAINED_MODEL': {
35
- 'value':
36
- 'google/umt5-small',
37
- 'description':
38
- 'Pretrained Model for umt5, modelcard path or local path.'
39
- },
40
- 'TOKENIZER_PATH': {
41
- 'value': 'google/umt5-small',
42
- 'description':
43
- 'Tokenizer Path for umt5, modelcard path or local path.'
44
- },
45
- 'FREEZE': {
46
- 'value': True,
47
- 'description': ''
48
- },
49
- 'USE_GRAD': {
50
- 'value': False,
51
- 'description': 'Compute grad or not.'
52
- },
53
- 'CLEAN': {
54
- 'value':
55
- 'whitespace',
56
- 'description':
57
- 'Set the clean strtegy for tokenizer, used when TOKENIZER_PATH is not None.'
58
- },
59
- 'LAYER': {
60
- 'value': 'last',
61
- 'description': ''
62
- },
63
- 'LEGACY': {
64
- 'value':
65
- True,
66
- 'description':
67
- 'Whether use legacy returnd feature or not ,default True.'
68
- }
69
- }
70
-
71
- def __init__(self, cfg, logger=None):
72
- super().__init__(cfg, logger=logger)
73
- pretrained_path = cfg.get('PRETRAINED_MODEL', None)
74
- self.t5_dtype = cfg.get('T5_DTYPE', 'float32')
75
- assert pretrained_path
76
- with FS.get_dir_to_local_dir(pretrained_path,
77
- wait_finish=True) as local_path:
78
- self.model = T5EncoderModel.from_pretrained(
79
- local_path,
80
- torch_dtype=getattr(
81
- torch,
82
- 'float' if self.t5_dtype == 'float32' else self.t5_dtype))
83
- tokenizer_path = cfg.get('TOKENIZER_PATH', None)
84
- self.length = cfg.get('LENGTH', 77)
85
-
86
- self.use_grad = cfg.get('USE_GRAD', False)
87
- self.clean = cfg.get('CLEAN', 'whitespace')
88
- self.added_identifier = cfg.get('ADDED_IDENTIFIER', None)
89
- if tokenizer_path:
90
- self.tokenize_kargs = {'return_tensors': 'pt'}
91
- with FS.get_dir_to_local_dir(tokenizer_path,
92
- wait_finish=True) as local_path:
93
- if self.added_identifier is not None and isinstance(
94
- self.added_identifier, list):
95
- self.tokenizer = AutoTokenizer.from_pretrained(local_path)
96
- else:
97
- self.tokenizer = AutoTokenizer.from_pretrained(local_path)
98
- if self.length is not None:
99
- self.tokenize_kargs.update({
100
- 'padding': 'max_length',
101
- 'truncation': True,
102
- 'max_length': self.length
103
- })
104
- self.eos_token = self.tokenizer(
105
- self.tokenizer.eos_token)['input_ids'][0]
106
- else:
107
- self.tokenizer = None
108
- self.tokenize_kargs = {}
109
-
110
- self.use_grad = cfg.get('USE_GRAD', False)
111
- self.clean = cfg.get('CLEAN', 'whitespace')
112
-
113
- def freeze(self):
114
- self.model = self.model.eval()
115
- for param in self.parameters():
116
- param.requires_grad = False
117
-
118
- # encode && encode_text
119
- def forward(self, tokens, return_mask=False, use_mask=True):
120
- # tokenization
121
- embedding_context = nullcontext if self.use_grad else torch.no_grad
122
- with embedding_context():
123
- if use_mask:
124
- x = self.model(tokens.input_ids.to(we.device_id),
125
- tokens.attention_mask.to(we.device_id))
126
- else:
127
- x = self.model(tokens.input_ids.to(we.device_id))
128
- x = x.last_hidden_state
129
-
130
- if return_mask:
131
- return x.detach() + 0.0, tokens.attention_mask.to(we.device_id)
132
- else:
133
- return x.detach() + 0.0, None
134
-
135
- def _clean(self, text):
136
- if self.clean == 'whitespace':
137
- text = whitespace_clean(basic_clean(text))
138
- elif self.clean == 'lower':
139
- text = whitespace_clean(basic_clean(text)).lower()
140
- elif self.clean == 'canonicalize':
141
- text = canonicalize(basic_clean(text))
142
- elif self.clean == 'heavy':
143
- text = heavy_clean(basic_clean(text))
144
- return text
145
-
146
- def encode(self, text, return_mask=False, use_mask=True):
147
- if isinstance(text, str):
148
- text = [text]
149
- if self.clean:
150
- text = [self._clean(u) for u in text]
151
- assert self.tokenizer is not None
152
- cont, mask = [], []
153
- with torch.autocast(device_type='cuda',
154
- enabled=self.t5_dtype in ('float16', 'bfloat16'),
155
- dtype=getattr(torch, self.t5_dtype)):
156
- for tt in text:
157
- tokens = self.tokenizer([tt], **self.tokenize_kargs)
158
- one_cont, one_mask = self(tokens,
159
- return_mask=return_mask,
160
- use_mask=use_mask)
161
- cont.append(one_cont)
162
- mask.append(one_mask)
163
- if return_mask:
164
- return torch.cat(cont, dim=0), torch.cat(mask, dim=0)
165
- else:
166
- return torch.cat(cont, dim=0)
167
-
168
- def encode_list(self, text_list, return_mask=True):
169
- cont_list = []
170
- mask_list = []
171
- for pp in text_list:
172
- cont, cont_mask = self.encode(pp, return_mask=return_mask)
173
- cont_list.append(cont)
174
- mask_list.append(cont_mask)
175
- if return_mask:
176
- return cont_list, mask_list
177
- else:
178
- return cont_list
179
-
180
- @staticmethod
181
- def get_config_template():
182
- return dict_to_yaml('MODELS',
183
- __class__.__name__,
184
- ACETextEmbedder.para_dict,
185
- set_name=True)
186
-
187
- @EMBEDDERS.register_class()
188
- class ACEHFEmbedder(BaseEmbedder):
189
- para_dict = {
190
- "HF_MODEL_CLS": {
191
- "value": None,
192
- "description": "huggingface cls in transfomer"
193
- },
194
- "MODEL_PATH": {
195
- "value": None,
196
- "description": "model folder path"
197
- },
198
- "HF_TOKENIZER_CLS": {
199
- "value": None,
200
- "description": "huggingface cls in transfomer"
201
- },
202
-
203
- "TOKENIZER_PATH": {
204
- "value": None,
205
- "description": "tokenizer folder path"
206
- },
207
- "MAX_LENGTH": {
208
- "value": 77,
209
- "description": "max length of input"
210
- },
211
- "OUTPUT_KEY": {
212
- "value": "last_hidden_state",
213
- "description": "output key"
214
- },
215
- "D_TYPE": {
216
- "value": "float",
217
- "description": "dtype"
218
- },
219
- "BATCH_INFER": {
220
- "value": False,
221
- "description": "batch infer"
222
- }
223
- }
224
- para_dict.update(BaseEmbedder.para_dict)
225
- def __init__(self, cfg, logger=None):
226
- super().__init__(cfg, logger=logger)
227
- hf_model_cls = cfg.get('HF_MODEL_CLS', None)
228
- model_path = cfg.get("MODEL_PATH", None)
229
- hf_tokenizer_cls = cfg.get('HF_TOKENIZER_CLS', None)
230
- tokenizer_path = cfg.get('TOKENIZER_PATH', None)
231
- self.max_length = cfg.get('MAX_LENGTH', 77)
232
- self.output_key = cfg.get("OUTPUT_KEY", "last_hidden_state")
233
- self.d_type = cfg.get("D_TYPE", "float")
234
- self.clean = cfg.get("CLEAN", "whitespace")
235
- self.batch_infer = cfg.get("BATCH_INFER", False)
236
- self.added_identifier = cfg.get('ADDED_IDENTIFIER', None)
237
- torch_dtype = getattr(torch, self.d_type)
238
-
239
- assert hf_model_cls is not None and hf_tokenizer_cls is not None
240
- assert model_path is not None and tokenizer_path is not None
241
- with FS.get_dir_to_local_dir(tokenizer_path, wait_finish=True) as local_path:
242
- self.tokenizer = getattr(transformers, hf_tokenizer_cls).from_pretrained(local_path,
243
- max_length = self.max_length,
244
- torch_dtype = torch_dtype,
245
- additional_special_tokens=self.added_identifier)
246
-
247
- with FS.get_dir_to_local_dir(model_path, wait_finish=True) as local_path:
248
- self.hf_module = getattr(transformers, hf_model_cls).from_pretrained(local_path, torch_dtype = torch_dtype)
249
-
250
-
251
- self.hf_module = self.hf_module.eval().requires_grad_(False)
252
-
253
- def forward(self, text: list[str], return_mask = False):
254
- batch_encoding = self.tokenizer(
255
- text,
256
- truncation=True,
257
- max_length=self.max_length,
258
- return_length=False,
259
- return_overflowing_tokens=False,
260
- padding="max_length",
261
- return_tensors="pt",
262
- )
263
-
264
- outputs = self.hf_module(
265
- input_ids=batch_encoding["input_ids"].to(self.hf_module.device),
266
- attention_mask=None,
267
- output_hidden_states=False,
268
- )
269
- if return_mask:
270
- return outputs[self.output_key], batch_encoding['attention_mask'].to(self.hf_module.device)
271
- else:
272
- return outputs[self.output_key], None
273
-
274
- def encode(self, text, return_mask = False):
275
- if isinstance(text, str):
276
- text = [text]
277
- if self.clean:
278
- text = [self._clean(u) for u in text]
279
- if not self.batch_infer:
280
- cont, mask = [], []
281
- for tt in text:
282
- one_cont, one_mask = self([tt], return_mask=return_mask)
283
- cont.append(one_cont)
284
- mask.append(one_mask)
285
- if return_mask:
286
- return torch.cat(cont, dim=0), torch.cat(mask, dim=0)
287
- else:
288
- return torch.cat(cont, dim=0)
289
- else:
290
- ret_data = self(text, return_mask = return_mask)
291
- if return_mask:
292
- return ret_data
293
- else:
294
- return ret_data[0]
295
-
296
- def encode_list(self, text_list, return_mask=True):
297
- cont_list = []
298
- mask_list = []
299
- for pp in text_list:
300
- cont = self.encode(pp, return_mask=return_mask)
301
- cont_list.append(cont[0]) if return_mask else cont_list.append(cont)
302
- mask_list.append(cont[1]) if return_mask else mask_list.append(None)
303
- if return_mask:
304
- return cont_list, mask_list
305
- else:
306
- return cont_list
307
-
308
- def encode_list_of_list(self, text_list, return_mask=True):
309
- cont_list = []
310
- mask_list = []
311
- for pp in text_list:
312
- cont = self.encode_list(pp, return_mask=return_mask)
313
- cont_list.append(cont[0]) if return_mask else cont_list.append(cont)
314
- mask_list.append(cont[1]) if return_mask else mask_list.append(None)
315
- if return_mask:
316
- return cont_list, mask_list
317
- else:
318
- return cont_list
319
-
320
- def _clean(self, text):
321
- if self.clean == 'whitespace':
322
- text = whitespace_clean(basic_clean(text))
323
- elif self.clean == 'lower':
324
- text = whitespace_clean(basic_clean(text)).lower()
325
- elif self.clean == 'canonicalize':
326
- text = canonicalize(basic_clean(text))
327
- return text
328
- @staticmethod
329
- def get_config_template():
330
- return dict_to_yaml('EMBEDDER',
331
- __class__.__name__,
332
- ACEHFEmbedder.para_dict,
333
- set_name=True)
334
-
335
- @EMBEDDERS.register_class()
336
- class T5ACEPlusClipFluxEmbedder(BaseEmbedder):
337
- """
338
- Uses the OpenCLIP transformer encoder for text
339
- """
340
- para_dict = {
341
- 'T5_MODEL': {},
342
- 'CLIP_MODEL': {}
343
- }
344
-
345
- def __init__(self, cfg, logger=None):
346
- super().__init__(cfg, logger=logger)
347
- self.t5_model = EMBEDDERS.build(cfg.T5_MODEL, logger=logger)
348
- self.clip_model = EMBEDDERS.build(cfg.CLIP_MODEL, logger=logger)
349
-
350
- def encode(self, text, return_mask = False):
351
- t5_embeds = self.t5_model.encode(text, return_mask = return_mask)
352
- clip_embeds = self.clip_model.encode(text, return_mask = return_mask)
353
- # change embedding strategy here
354
- return {
355
- 'context': t5_embeds,
356
- 'y': clip_embeds,
357
- }
358
-
359
- def encode_list(self, text, return_mask = False):
360
- t5_embeds = self.t5_model.encode_list(text, return_mask = return_mask)
361
- clip_embeds = self.clip_model.encode_list(text, return_mask = return_mask)
362
- # change embedding strategy here
363
- return {
364
- 'context': t5_embeds,
365
- 'y': clip_embeds,
366
- }
367
-
368
- def encode_list_of_list(self, text, return_mask = False):
369
- t5_embeds = self.t5_model.encode_list_of_list(text, return_mask = return_mask)
370
- clip_embeds = self.clip_model.encode_list_of_list(text, return_mask = return_mask)
371
- # change embedding strategy here
372
- return {
373
- 'context': t5_embeds,
374
- 'y': clip_embeds,
375
- }
376
-
377
-
378
- @staticmethod
379
- def get_config_template():
380
- return dict_to_yaml('EMBEDDER',
381
- __class__.__name__,
382
- T5ACEPlusClipFluxEmbedder.para_dict,
383
- set_name=True)
 
 
models/flux.py DELETED
@@ -1,798 +0,0 @@
1
- import math, torch
2
- from collections import OrderedDict
3
- from functools import partial
4
- from einops import rearrange, repeat
5
- from scepter.modules.model.base_model import BaseModel
6
- from scepter.modules.model.registry import BACKBONES
7
- from scepter.modules.utils.config import dict_to_yaml
8
- from scepter.modules.utils.distribute import we
9
- from scepter.modules.utils.file_system import FS
10
- from torch import Tensor, nn
11
- from torch.nn.utils.rnn import pad_sequence
12
- from torch.utils.checkpoint import checkpoint_sequential
13
-
14
- from .layers import (DoubleStreamBlock, EmbedND, LastLayer,
15
- MLPEmbedder, SingleStreamBlock,
16
- timestep_embedding, DoubleStreamBlockACE, SingleStreamBlockACE)
17
-
18
- @BACKBONES.register_class()
19
- class Flux(BaseModel):
20
- """
21
- Transformer backbone Diffusion model with RoPE.
22
- """
23
- para_dict = {
24
- "IN_CHANNELS": {
25
- "value": 64,
26
- "description": "model's input channels."
27
- },
28
- "OUT_CHANNELS": {
29
- "value": 64,
30
- "description": "model's output channels."
31
- },
32
- "HIDDEN_SIZE": {
33
- "value": 1024,
34
- "description": "model's hidden size."
35
- },
36
- "NUM_HEADS": {
37
- "value": 16,
38
- "description": "number of heads in the transformer."
39
- },
40
- "AXES_DIM": {
41
- "value": [16, 56, 56],
42
- "description": "dimensions of the axes of the positional encoding."
43
- },
44
- "THETA": {
45
- "value": 10_000,
46
- "description": "theta for positional encoding."
47
- },
48
- "VEC_IN_DIM": {
49
- "value": 768,
50
- "description": "dimension of the vector input."
51
- },
52
- "GUIDANCE_EMBED": {
53
- "value": False,
54
- "description": "whether to use guidance embedding."
55
- },
56
- "CONTEXT_IN_DIM": {
57
- "value": 4096,
58
- "description": "dimension of the context input."
59
- },
60
- "MLP_RATIO": {
61
- "value": 4.0,
62
- "description": "ratio of mlp hidden size to hidden size."
63
- },
64
- "QKV_BIAS": {
65
- "value": True,
66
- "description": "whether to use bias in qkv projection."
67
- },
68
- "DEPTH": {
69
- "value": 19,
70
- "description": "number of transformer blocks."
71
- },
72
- "DEPTH_SINGLE_BLOCKS": {
73
- "value": 38,
74
- "description": "number of transformer blocks in the single stream block."
75
- },
76
- "USE_GRAD_CHECKPOINT": {
77
- "value": False,
78
- "description": "whether to use gradient checkpointing."
79
- },
80
- "ATTN_BACKEND": {
81
- "value": "pytorch",
82
- "description": "backend for the transformer blocks, 'pytorch' or 'flash_attn'."
83
- }
84
- }
85
- def __init__(
86
- self,
87
- cfg,
88
- logger = None
89
- ):
90
- super().__init__(cfg, logger=logger)
91
- self.in_channels = cfg.IN_CHANNELS
92
- self.out_channels = cfg.get("OUT_CHANNELS", self.in_channels)
93
- hidden_size = cfg.get("HIDDEN_SIZE", 1024)
94
- num_heads = cfg.get("NUM_HEADS", 16)
95
- axes_dim = cfg.AXES_DIM
96
- theta = cfg.THETA
97
- vec_in_dim = cfg.VEC_IN_DIM
98
- self.guidance_embed = cfg.GUIDANCE_EMBED
99
- context_in_dim = cfg.CONTEXT_IN_DIM
100
- mlp_ratio = cfg.MLP_RATIO
101
- qkv_bias = cfg.QKV_BIAS
102
- depth = cfg.DEPTH
103
- depth_single_blocks = cfg.DEPTH_SINGLE_BLOCKS
104
- self.use_grad_checkpoint = cfg.get("USE_GRAD_CHECKPOINT", False)
105
- self.attn_backend = cfg.get("ATTN_BACKEND", "pytorch")
106
- self.lora_model = cfg.get("DIFFUSERS_LORA_MODEL", None)
107
- self.swift_lora_model = cfg.get("SWIFT_LORA_MODEL", None)
108
- self.blackforest_lora_model = cfg.get("BLACKFOREST_LORA_MODEL", None)
109
- self.pretrain_adapter = cfg.get("PRETRAIN_ADAPTER", None)
110
-
111
- if hidden_size % num_heads != 0:
112
- raise ValueError(
113
- f"Hidden size {hidden_size} must be divisible by num_heads {num_heads}"
114
- )
115
- pe_dim = hidden_size // num_heads
116
- if sum(axes_dim) != pe_dim:
117
- raise ValueError(f"Got {axes_dim} but expected positional dim {pe_dim}")
118
- self.hidden_size = hidden_size
119
- self.num_heads = num_heads
120
- self.pe_embedder = EmbedND(dim=pe_dim, theta=theta, axes_dim= axes_dim)
121
- self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
122
- self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
123
- self.vector_in = MLPEmbedder(vec_in_dim, self.hidden_size)
124
- self.guidance_in = (
125
- MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) if self.guidance_embed else nn.Identity()
126
- )
127
- self.txt_in = nn.Linear(context_in_dim, self.hidden_size)
128
-
129
- self.double_blocks = nn.ModuleList(
130
- [
131
- DoubleStreamBlock(
132
- self.hidden_size,
133
- self.num_heads,
134
- mlp_ratio=mlp_ratio,
135
- qkv_bias=qkv_bias,
136
- backend=self.attn_backend
137
- )
138
- for _ in range(depth)
139
- ]
140
- )
141
-
142
- self.single_blocks = nn.ModuleList(
143
- [
144
- SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=mlp_ratio, backend=self.attn_backend)
145
- for _ in range(depth_single_blocks)
146
- ]
147
- )
148
-
149
- self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)
150
-
151
- def prepare_input(self, x, context, y, x_shape=None):
152
- # x.shape [6, 16, 16, 16] target is [6, 16, 768, 1360]
153
- bs, c, h, w = x.shape
154
- x = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
155
- x_id = torch.zeros(h // 2, w // 2, 3)
156
- x_id[..., 1] = x_id[..., 1] + torch.arange(h // 2)[:, None]
157
- x_id[..., 2] = x_id[..., 2] + torch.arange(w // 2)[None, :]
158
- x_ids = repeat(x_id, "h w c -> b (h w) c", b=bs)
159
- txt_ids = torch.zeros(bs, context.shape[1], 3)
160
- return x, x_ids.to(x), context.to(x), txt_ids.to(x), y.to(x), h, w
161
-
162
- def unpack(self, x: Tensor, height: int, width: int) -> Tensor:
163
- return rearrange(
164
- x,
165
- "b (h w) (c ph pw) -> b c (h ph) (w pw)",
166
- h=math.ceil(height/2),
167
- w=math.ceil(width/2),
168
- ph=2,
169
- pw=2,
170
- )
171
-
172
- # def merge_diffuser_lora(self, ori_sd, lora_sd, scale = 1.0):
173
- # key_map = {
174
- # "single_blocks.{}.linear1.weight": {"key_list": [
175
- # ["transformer.single_transformer_blocks.{}.attn.to_q.lora_A.weight",
176
- # "transformer.single_transformer_blocks.{}.attn.to_q.lora_B.weight"],
177
- # ["transformer.single_transformer_blocks.{}.attn.to_k.lora_A.weight",
178
- # "transformer.single_transformer_blocks.{}.attn.to_k.lora_B.weight"],
179
- # ["transformer.single_transformer_blocks.{}.attn.to_v.lora_A.weight",
180
- # "transformer.single_transformer_blocks.{}.attn.to_v.lora_B.weight"],
181
- # ["transformer.single_transformer_blocks.{}.proj_mlp.lora_A.weight",
182
- # "transformer.single_transformer_blocks.{}.proj_mlp.lora_B.weight"]
183
- # ], "num": 38},
184
- # "single_blocks.{}.modulation.lin.weight": {"key_list": [
185
- # ["transformer.single_transformer_blocks.{}.norm.linear.lora_A.weight",
186
- # "transformer.single_transformer_blocks.{}.norm.linear.lora_B.weight"],
187
- # ], "num": 38},
188
- # "single_blocks.{}.linear2.weight": {"key_list": [
189
- # ["transformer.single_transformer_blocks.{}.proj_out.lora_A.weight",
190
- # "transformer.single_transformer_blocks.{}.proj_out.lora_B.weight"],
191
- # ], "num": 38},
192
- # "double_blocks.{}.txt_attn.qkv.weight": {"key_list": [
193
- # ["transformer.transformer_blocks.{}.attn.add_q_proj.lora_A.weight",
194
- # "transformer.transformer_blocks.{}.attn.add_q_proj.lora_B.weight"],
195
- # ["transformer.transformer_blocks.{}.attn.add_k_proj.lora_A.weight",
196
- # "transformer.transformer_blocks.{}.attn.add_k_proj.lora_B.weight"],
197
- # ["transformer.transformer_blocks.{}.attn.add_v_proj.lora_A.weight",
198
- # "transformer.transformer_blocks.{}.attn.add_v_proj.lora_B.weight"],
199
- # ], "num": 19},
200
- # "double_blocks.{}.img_attn.qkv.weight": {"key_list": [
201
- # ["transformer.transformer_blocks.{}.attn.to_q.lora_A.weight",
202
- # "transformer.transformer_blocks.{}.attn.to_q.lora_B.weight"],
203
- # ["transformer.transformer_blocks.{}.attn.to_k.lora_A.weight",
204
- # "transformer.transformer_blocks.{}.attn.to_k.lora_B.weight"],
205
- # ["transformer.transformer_blocks.{}.attn.to_v.lora_A.weight",
206
- # "transformer.transformer_blocks.{}.attn.to_v.lora_B.weight"],
207
- # ], "num": 19},
208
- # "double_blocks.{}.img_attn.proj.weight": {"key_list": [
209
- # ["transformer.transformer_blocks.{}.attn.to_out.0.lora_A.weight",
210
- # "transformer.transformer_blocks.{}.attn.to_out.0.lora_B.weight"]
211
- # ], "num": 19},
212
- # "double_blocks.{}.txt_attn.proj.weight": {"key_list": [
213
- # ["transformer.transformer_blocks.{}.attn.to_add_out.lora_A.weight",
214
- # "transformer.transformer_blocks.{}.attn.to_add_out.lora_B.weight"]
215
- # ], "num": 19},
216
- # "double_blocks.{}.img_mlp.0.weight": {"key_list": [
217
- # ["transformer.transformer_blocks.{}.ff.net.0.proj.lora_A.weight",
218
- # "transformer.transformer_blocks.{}.ff.net.0.proj.lora_B.weight"]
219
- # ], "num": 19},
220
- # "double_blocks.{}.img_mlp.2.weight": {"key_list": [
221
- # ["transformer.transformer_blocks.{}.ff.net.2.lora_A.weight",
222
- # "transformer.transformer_blocks.{}.ff.net.2.lora_B.weight"]
223
- # ], "num": 19},
224
- # "double_blocks.{}.txt_mlp.0.weight": {"key_list": [
225
- # ["transformer.transformer_blocks.{}.ff_context.net.0.proj.lora_A.weight",
226
- # "transformer.transformer_blocks.{}.ff_context.net.0.proj.lora_B.weight"]
227
- # ], "num": 19},
228
- # "double_blocks.{}.txt_mlp.2.weight": {"key_list": [
229
- # ["transformer.transformer_blocks.{}.ff_context.net.2.lora_A.weight",
230
- # "transformer.transformer_blocks.{}.ff_context.net.2.lora_B.weight"]
231
- # ], "num": 19},
232
- # "double_blocks.{}.img_mod.lin.weight": {"key_list": [
233
- # ["transformer.transformer_blocks.{}.norm1.linear.lora_A.weight",
234
- # "transformer.transformer_blocks.{}.norm1.linear.lora_B.weight"]
235
- # ], "num": 19},
236
- # "double_blocks.{}.txt_mod.lin.weight": {"key_list": [
237
- # ["transformer.transformer_blocks.{}.norm1_context.linear.lora_A.weight",
238
- # "transformer.transformer_blocks.{}.norm1_context.linear.lora_B.weight"]
239
- # ], "num": 19}
240
- # }
241
- # have_lora_keys = 0
242
- # for k, v in key_map.items():
243
- # key_list = v["key_list"]
244
- # block_num = v["num"]
245
- # for block_id in range(block_num):
246
- # current_weight_list = []
247
- # for k_list in key_list:
248
- # current_weight = torch.matmul(lora_sd[k_list[0].format(block_id)].permute(1, 0),
249
- # lora_sd[k_list[1].format(block_id)].permute(1, 0)).permute(1, 0)
250
- # current_weight_list.append(current_weight)
251
- # current_weight = torch.cat(current_weight_list, dim=0)
252
- # ori_sd[k.format(block_id)] += scale*current_weight
253
- # have_lora_keys += 1
254
- # self.logger.info(f"merge_swift_lora loads lora'parameters {have_lora_keys}")
255
- # return ori_sd
256
-
257
- def merge_diffuser_lora(self, ori_sd, lora_sd, scale=1.0):
258
- key_map = {
259
- "single_blocks.{}.linear1.weight": {"key_list": [
260
- ["transformer.single_transformer_blocks.{}.attn.to_q.lora_A.weight",
261
- "transformer.single_transformer_blocks.{}.attn.to_q.lora_B.weight", [0, 3072]],
262
- ["transformer.single_transformer_blocks.{}.attn.to_k.lora_A.weight",
263
- "transformer.single_transformer_blocks.{}.attn.to_k.lora_B.weight", [3072, 6144]],
264
- ["transformer.single_transformer_blocks.{}.attn.to_v.lora_A.weight",
265
- "transformer.single_transformer_blocks.{}.attn.to_v.lora_B.weight", [6144, 9216]],
266
- ["transformer.single_transformer_blocks.{}.proj_mlp.lora_A.weight",
267
- "transformer.single_transformer_blocks.{}.proj_mlp.lora_B.weight", [9216, 21504]]
268
- ], "num": 38},
269
- "single_blocks.{}.modulation.lin.weight": {"key_list": [
270
- ["transformer.single_transformer_blocks.{}.norm.linear.lora_A.weight",
271
- "transformer.single_transformer_blocks.{}.norm.linear.lora_B.weight", [0, 9216]],
272
- ], "num": 38},
273
- "single_blocks.{}.linear2.weight": {"key_list": [
274
- ["transformer.single_transformer_blocks.{}.proj_out.lora_A.weight",
275
- "transformer.single_transformer_blocks.{}.proj_out.lora_B.weight", [0, 3072]],
276
- ], "num": 38},
277
- "double_blocks.{}.txt_attn.qkv.weight": {"key_list": [
278
- ["transformer.transformer_blocks.{}.attn.add_q_proj.lora_A.weight",
279
- "transformer.transformer_blocks.{}.attn.add_q_proj.lora_B.weight", [0, 3072]],
280
- ["transformer.transformer_blocks.{}.attn.add_k_proj.lora_A.weight",
281
- "transformer.transformer_blocks.{}.attn.add_k_proj.lora_B.weight", [3072, 6144]],
282
- ["transformer.transformer_blocks.{}.attn.add_v_proj.lora_A.weight",
283
- "transformer.transformer_blocks.{}.attn.add_v_proj.lora_B.weight", [6144, 9216]],
284
- ], "num": 19},
285
- "double_blocks.{}.img_attn.qkv.weight": {"key_list": [
286
- ["transformer.transformer_blocks.{}.attn.to_q.lora_A.weight",
287
- "transformer.transformer_blocks.{}.attn.to_q.lora_B.weight", [0, 3072]],
288
- ["transformer.transformer_blocks.{}.attn.to_k.lora_A.weight",
289
- "transformer.transformer_blocks.{}.attn.to_k.lora_B.weight", [3072, 6144]],
290
- ["transformer.transformer_blocks.{}.attn.to_v.lora_A.weight",
291
- "transformer.transformer_blocks.{}.attn.to_v.lora_B.weight", [6144, 9216]],
292
- ], "num": 19},
293
- "double_blocks.{}.img_attn.proj.weight": {"key_list": [
294
- ["transformer.transformer_blocks.{}.attn.to_out.0.lora_A.weight",
295
- "transformer.transformer_blocks.{}.attn.to_out.0.lora_B.weight", [0, 3072]]
296
- ], "num": 19},
297
- "double_blocks.{}.txt_attn.proj.weight": {"key_list": [
298
- ["transformer.transformer_blocks.{}.attn.to_add_out.lora_A.weight",
299
- "transformer.transformer_blocks.{}.attn.to_add_out.lora_B.weight", [0, 3072]]
300
- ], "num": 19},
301
- "double_blocks.{}.img_mlp.0.weight": {"key_list": [
302
- ["transformer.transformer_blocks.{}.ff.net.0.proj.lora_A.weight",
303
- "transformer.transformer_blocks.{}.ff.net.0.proj.lora_B.weight", [0, 12288]]
304
- ], "num": 19},
305
- "double_blocks.{}.img_mlp.2.weight": {"key_list": [
306
- ["transformer.transformer_blocks.{}.ff.net.2.lora_A.weight",
307
- "transformer.transformer_blocks.{}.ff.net.2.lora_B.weight", [0, 3072]]
308
- ], "num": 19},
309
- "double_blocks.{}.txt_mlp.0.weight": {"key_list": [
310
- ["transformer.transformer_blocks.{}.ff_context.net.0.proj.lora_A.weight",
311
- "transformer.transformer_blocks.{}.ff_context.net.0.proj.lora_B.weight", [0, 12288]]
312
- ], "num": 19},
313
- "double_blocks.{}.txt_mlp.2.weight": {"key_list": [
314
- ["transformer.transformer_blocks.{}.ff_context.net.2.lora_A.weight",
315
- "transformer.transformer_blocks.{}.ff_context.net.2.lora_B.weight", [0, 3072]]
316
- ], "num": 19},
317
- "double_blocks.{}.img_mod.lin.weight": {"key_list": [
318
- ["transformer.transformer_blocks.{}.norm1.linear.lora_A.weight",
319
- "transformer.transformer_blocks.{}.norm1.linear.lora_B.weight", [0, 18432]]
320
- ], "num": 19},
321
- "double_blocks.{}.txt_mod.lin.weight": {"key_list": [
322
- ["transformer.transformer_blocks.{}.norm1_context.linear.lora_A.weight",
323
- "transformer.transformer_blocks.{}.norm1_context.linear.lora_B.weight", [0, 18432]]
324
- ], "num": 19}
325
- }
326
- cover_lora_keys = set()
327
- cover_ori_keys = set()
328
- for k, v in key_map.items():
329
- key_list = v["key_list"]
330
- block_num = v["num"]
331
- for block_id in range(block_num):
332
- for k_list in key_list:
333
- if k_list[0].format(block_id) in lora_sd and k_list[1].format(block_id) in lora_sd:
334
- cover_lora_keys.add(k_list[0].format(block_id))
335
- cover_lora_keys.add(k_list[1].format(block_id))
336
- current_weight = torch.matmul(lora_sd[k_list[0].format(block_id)].permute(1, 0),
337
- lora_sd[k_list[1].format(block_id)].permute(1, 0)).permute(1, 0)
338
- ori_sd[k.format(block_id)][k_list[2][0]:k_list[2][1], ...] += scale * current_weight
339
- cover_ori_keys.add(k.format(block_id))
340
- # lora_sd.pop(k_list[0].format(block_id))
341
- # lora_sd.pop(k_list[1].format(block_id))
342
- self.logger.info(f"merge_blackforest_lora loads lora'parameters lora-paras: \n"
343
- f"cover-{len(cover_lora_keys)} vs total {len(lora_sd)} \n"
344
- f"cover ori-{len(cover_ori_keys)} vs total {len(ori_sd)}")
345
- return ori_sd
346
-
347
- def merge_swift_lora(self, ori_sd, lora_sd, scale = 1.0):
348
- have_lora_keys = {}
349
- for k, v in lora_sd.items():
350
- k = k[len("model."):] if k.startswith("model.") else k
351
- ori_key = k.split("lora")[0] + "weight"
352
- if ori_key not in ori_sd:
353
- raise f"{ori_key} should in the original statedict"
354
- if ori_key not in have_lora_keys:
355
- have_lora_keys[ori_key] = {}
356
- if "lora_A" in k:
357
- have_lora_keys[ori_key]["lora_A"] = v
358
- elif "lora_B" in k:
359
- have_lora_keys[ori_key]["lora_B"] = v
360
- else:
361
- raise NotImplementedError
362
- self.logger.info(f"merge_swift_lora loads lora'parameters {len(have_lora_keys)}")
363
- for key, v in have_lora_keys.items():
364
- current_weight = torch.matmul(v["lora_A"].permute(1, 0), v["lora_B"].permute(1, 0)).permute(1, 0)
365
- ori_sd[key] += scale * current_weight
366
- return ori_sd
367
-
368
-
369
- def merge_blackforest_lora(self, ori_sd, lora_sd, scale = 1.0):
370
- have_lora_keys = {}
371
- cover_lora_keys = set()
372
- cover_ori_keys = set()
373
- for k, v in lora_sd.items():
374
- if "lora" in k:
375
- ori_key = k.split("lora")[0] + "weight"
376
- if ori_key not in ori_sd:
377
- raise f"{ori_key} should in the original statedict"
378
- if ori_key not in have_lora_keys:
379
- have_lora_keys[ori_key] = {}
380
- if "lora_A" in k:
381
- have_lora_keys[ori_key]["lora_A"] = v
382
- cover_lora_keys.add(k)
383
- cover_ori_keys.add(ori_key)
384
- elif "lora_B" in k:
385
- have_lora_keys[ori_key]["lora_B"] = v
386
- cover_lora_keys.add(k)
387
- cover_ori_keys.add(ori_key)
388
- else:
389
- if k in ori_sd:
390
- ori_sd[k] = v
391
- cover_lora_keys.add(k)
392
- cover_ori_keys.add(k)
393
- else:
394
- print("unsurpport keys: ", k)
395
- self.logger.info(f"merge_blackforest_lora loads lora'parameters lora-paras: \n"
396
- f"cover-{len(cover_lora_keys)} vs total {len(lora_sd)} \n"
397
- f"cover ori-{len(cover_ori_keys)} vs total {len(ori_sd)}")
398
-
399
- for key, v in have_lora_keys.items():
400
- current_weight = torch.matmul(v["lora_A"].permute(1, 0), v["lora_B"].permute(1, 0)).permute(1, 0)
401
- # print(key, ori_sd[key].shape, current_weight.shape)
402
- ori_sd[key] += scale * current_weight
403
- return ori_sd
404
-
405
- def load_pretrained_model(self, pretrained_model):
406
- if next(self.parameters()).device.type == 'meta':
407
- map_location = torch.device(we.device_id)
408
- safe_device = we.device_id
409
- else:
410
- map_location = "cpu"
411
- safe_device = "cpu"
412
-
413
- if pretrained_model is not None:
414
- with FS.get_from(pretrained_model, wait_finish=True) as local_model:
415
- if local_model.endswith('safetensors'):
416
- from safetensors.torch import load_file as load_safetensors
417
- sd = load_safetensors(local_model, device=safe_device)
418
- else:
419
- sd = torch.load(local_model, map_location=map_location, weights_only=True)
420
- if "state_dict" in sd:
421
- sd = sd["state_dict"]
422
- if "model" in sd:
423
- sd = sd["model"]["model"]
424
-
425
-
426
- new_ckpt = OrderedDict()
427
- for k, v in sd.items():
428
- if k in ("img_in.weight"):
429
- model_p = self.state_dict()[k]
430
- if v.shape != model_p.shape:
431
- expanded_state_dict_weight = torch.zeros_like(model_p, device=v.device)
432
- slices = tuple(slice(0, dim) for dim in v.shape)
433
- expanded_state_dict_weight[slices] = v
434
- new_ckpt[k] = expanded_state_dict_weight
435
- else:
436
- new_ckpt[k] = v
437
- else:
438
- new_ckpt[k] = v
439
-
440
-
441
- if self.lora_model is not None:
442
- with FS.get_from(self.lora_model, wait_finish=True) as local_model:
443
- if local_model.endswith('safetensors'):
444
- from safetensors.torch import load_file as load_safetensors
445
- lora_sd = load_safetensors(local_model, device=safe_device)
446
- else:
447
- lora_sd = torch.load(local_model, map_location=map_location, weights_only=True)
448
- new_ckpt = self.merge_diffuser_lora(new_ckpt, lora_sd)
449
- if self.swift_lora_model is not None:
450
- if not isinstance(self.swift_lora_model, list):
451
- self.swift_lora_model = [self.swift_lora_model]
452
- for lora_model in self.swift_lora_model:
453
- self.logger.info(f"load swift lora model: {lora_model}")
454
- with FS.get_from(lora_model, wait_finish=True) as local_model:
455
- if local_model.endswith('safetensors'):
456
- from safetensors.torch import load_file as load_safetensors
457
- lora_sd = load_safetensors(local_model, device=safe_device)
458
- else:
459
- lora_sd = torch.load(local_model, map_location=map_location, weights_only=True)
460
- new_ckpt = self.merge_swift_lora(new_ckpt, lora_sd)
461
- if self.blackforest_lora_model is not None:
462
-
463
- with FS.get_from(self.blackforest_lora_model, wait_finish=True) as local_model:
464
- if local_model.endswith('safetensors'):
465
- from safetensors.torch import load_file as load_safetensors
466
- lora_sd = load_safetensors(local_model, device=safe_device)
467
- else:
468
- lora_sd = torch.load(local_model, map_location=map_location, weights_only=True)
469
- new_ckpt = self.merge_blackforest_lora(new_ckpt, lora_sd)
470
-
471
-
472
- adapter_ckpt = {}
473
- if self.pretrain_adapter is not None:
474
- with FS.get_from(self.pretrain_adapter, wait_finish=True) as local_adapter:
475
- if local_adapter.endswith('safetensors'):
476
- from safetensors.torch import load_file as load_safetensors
477
- adapter_ckpt = load_safetensors(local_adapter, device=safe_device)
478
- else:
479
- adapter_ckpt = torch.load(local_adapter, map_location=map_location, weights_only=True)
480
- new_ckpt.update(adapter_ckpt)
481
-
482
- missing, unexpected = self.load_state_dict(new_ckpt, strict=False, assign=True)
483
- self.logger.info(
484
- f'Restored from {pretrained_model} with {len(missing)} missing and {len(unexpected)} unexpected keys'
485
- )
486
- if len(missing) > 0:
487
- self.logger.info(f'Missing Keys:\n {missing}')
488
- if len(unexpected) > 0:
489
- self.logger.info(f'\nUnexpected Keys:\n {unexpected}')
490
-
491
- def forward(
492
- self,
493
- x: Tensor,
494
- t: Tensor,
495
- cond: dict = {},
496
- guidance: Tensor | None = None,
497
- gc_seg: int = 0
498
- ) -> Tensor:
499
- x, x_ids, txt, txt_ids, y, h, w = self.prepare_input(x, cond["context"], cond["y"])
500
- # running on sequences img
501
- x = self.img_in(x)
502
- vec = self.time_in(timestep_embedding(t, 256))
503
- if self.guidance_embed:
504
- if guidance is None:
505
- raise ValueError("Didn't get guidance strength for guidance distilled model.")
506
- vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
507
- vec = vec + self.vector_in(y)
508
- txt = self.txt_in(txt)
509
- ids = torch.cat((txt_ids, x_ids), dim=1)
510
- pe = self.pe_embedder(ids)
511
- kwargs = dict(
512
- vec=vec,
513
- pe=pe,
514
- txt_length=txt.shape[1],
515
- )
516
- x = torch.cat((txt, x), 1)
517
- if self.use_grad_checkpoint and gc_seg >= 0:
518
- x = checkpoint_sequential(
519
- functions=[partial(block, **kwargs) for block in self.double_blocks],
520
- segments=gc_seg if gc_seg > 0 else len(self.double_blocks),
521
- input=x,
522
- use_reentrant=False
523
- )
524
- else:
525
- for block in self.double_blocks:
526
- x = block(x, **kwargs)
527
-
528
- kwargs = dict(
529
- vec=vec,
530
- pe=pe,
531
- )
532
-
533
- if self.use_grad_checkpoint and gc_seg >= 0:
534
- x = checkpoint_sequential(
535
- functions=[partial(block, **kwargs) for block in self.single_blocks],
536
- segments=gc_seg if gc_seg > 0 else len(self.single_blocks),
537
- input=x,
538
- use_reentrant=False
539
- )
540
- else:
541
- for block in self.single_blocks:
542
- x = block(x, **kwargs)
543
- x = x[:, txt.shape[1] :, ...]
544
- x = self.final_layer(x, vec) # (N, T, patch_size ** 2 * out_channels) 6 64 64
545
- x = self.unpack(x, h, w)
546
- return x
547
-
548
- @staticmethod
549
- def get_config_template():
550
- return dict_to_yaml('MODEL',
551
- __class__.__name__,
552
- Flux.para_dict,
553
- set_name=True)
554
- @BACKBONES.register_class()
555
- class ACEFlux(Flux):
556
- '''
557
- cat[x_seq, edit_seq]
558
- pe[x_seq] pe[edit_seq]
559
- '''
560
-
561
- def __init__(
562
- self,
563
- cfg,
564
- logger=None
565
- ):
566
- super().__init__(cfg, logger=logger)
567
- self.in_channels = cfg.IN_CHANNELS
568
- self.out_channels = cfg.get("OUT_CHANNELS", self.in_channels)
569
- hidden_size = cfg.get("HIDDEN_SIZE", 1024)
570
- num_heads = cfg.get("NUM_HEADS", 16)
571
- axes_dim = cfg.AXES_DIM
572
- theta = cfg.THETA
573
- vec_in_dim = cfg.VEC_IN_DIM
574
- self.guidance_embed = cfg.GUIDANCE_EMBED
575
- context_in_dim = cfg.CONTEXT_IN_DIM
576
- mlp_ratio = cfg.MLP_RATIO
577
- qkv_bias = cfg.QKV_BIAS
578
- depth = cfg.DEPTH
579
- depth_single_blocks = cfg.DEPTH_SINGLE_BLOCKS
580
- self.use_grad_checkpoint = cfg.get("USE_GRAD_CHECKPOINT", False)
581
- self.attn_backend = cfg.get("ATTN_BACKEND", "pytorch")
582
- self.lora_model = cfg.get("DIFFUSERS_LORA_MODEL", None)
583
- self.swift_lora_model = cfg.get("SWIFT_LORA_MODEL", None)
584
- self.blackforest_lora_model = cfg.get("BLACKFOREST_LORA_MODEL", None)
585
- self.pretrain_adapter = cfg.get("PRETRAIN_ADAPTER", None)
586
-
587
- if hidden_size % num_heads != 0:
588
- raise ValueError(
589
- f"Hidden size {hidden_size} must be divisible by num_heads {num_heads}"
590
- )
591
- pe_dim = hidden_size // num_heads
592
- if sum(axes_dim) != pe_dim:
593
- raise ValueError(f"Got {axes_dim} but expected positional dim {pe_dim}")
594
- self.hidden_size = hidden_size
595
- self.num_heads = num_heads
596
- self.pe_embedder = EmbedND(dim=pe_dim, theta=theta, axes_dim=axes_dim)
597
- self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
598
- self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
599
- self.vector_in = MLPEmbedder(vec_in_dim, self.hidden_size)
600
- self.guidance_in = (
601
- MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) if self.guidance_embed else nn.Identity()
602
- )
603
- self.txt_in = nn.Linear(context_in_dim, self.hidden_size)
604
-
605
- self.double_blocks = nn.ModuleList(
606
- [
607
- DoubleStreamBlockACE(
608
- self.hidden_size,
609
- self.num_heads,
610
- mlp_ratio=mlp_ratio,
611
- qkv_bias=qkv_bias,
612
- backend=self.attn_backend
613
- )
614
- for _ in range(depth)
615
- ]
616
- )
617
-
618
- self.single_blocks = nn.ModuleList(
619
- [
620
- SingleStreamBlockACE(self.hidden_size, self.num_heads, mlp_ratio=mlp_ratio, backend=self.attn_backend)
621
- for _ in range(depth_single_blocks)
622
- ]
623
- )
624
-
625
- self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)
626
-
627
- def prepare_input(self, x, cond, *args, **kwargs):
628
- context, y = cond["context"], cond["y"]
629
- # import pdb;pdb.set_trace()
630
- batch_shift = []
631
- x_list, x_id_list, mask_x_list, x_seq_length = [], [], [], []
632
- for ix, shape, is_align in zip(x, cond["x_shapes"], cond['align']):
633
- # unpack image from sequence
634
- ix = ix[:, :shape[0] * shape[1]].view(-1, shape[0], shape[1])
635
- c, h, w = ix.shape
636
- ix = rearrange(ix, "c (h ph) (w pw) -> (h w) (c ph pw)", ph=2, pw=2)
637
- ix_id = torch.zeros(h // 2, w // 2, 3)
638
- ix_id[..., 1] = ix_id[..., 1] + torch.arange(h // 2)[:, None]
639
- ix_id[..., 2] = ix_id[..., 2] + torch.arange(w // 2)[None, :]
640
- batch_shift.append(w // 2 if is_align < 1 else 0)
641
- ix_id = rearrange(ix_id, "h w c -> (h w) c")
642
- ix = self.img_in(ix)
643
- x_list.append(ix)
644
- x_id_list.append(ix_id)
645
- mask_x_list.append(torch.ones(ix.shape[0]).to(ix.device, non_blocking=True).bool())
646
- x_seq_length.append(ix.shape[0])
647
-
648
- x = pad_sequence(tuple(x_list), batch_first=True)
649
- x_ids = pad_sequence(tuple(x_id_list), batch_first=True).to(x) # [b,pad_seq,2] pad (0.,0.) at dim2
650
- mask_x = pad_sequence(tuple(mask_x_list), batch_first=True)
651
-
652
- if 'edit' in cond and sum(len(e) for e in cond['edit']) > 0:
653
- batch_frames, batch_frames_ids = [], []
654
- for i, edit in enumerate(cond['edit']):
655
- batch_frames.append([])
656
- batch_frames_ids.append([])
657
- for ie in edit:
658
- ie = ie.squeeze(0)
659
- c, h, w = ie.shape
660
- ie = rearrange(ie, "c (h ph) (w pw) -> (h w) (c ph pw)", ph=2, pw=2)
661
- ie_id = torch.zeros(h // 2, w // 2, 3)
662
- ie_id[..., 1] = ie_id[..., 1] + torch.arange(h // 2)[:, None]
663
- ie_id[..., 2] = ie_id[..., 2] + torch.arange(batch_shift[i], batch_shift[i] + w // 2)[None, :]
664
- ie_id = rearrange(ie_id, "h w c -> (h w) c")
665
- batch_frames[i].append(ie)
666
- batch_frames_ids[i].append(ie_id)
667
- edit_list, edit_id_list, edit_mask_x_list = [], [], []
668
- for frames, frame_ids in zip(batch_frames, batch_frames_ids):
669
- proj_frames = []
670
- for idx, one_frame in enumerate(frames):
671
- one_frame = self.img_in(one_frame)
672
- proj_frames.append(one_frame)
673
- ie = torch.cat(proj_frames, dim=0)
674
- ie_id = torch.cat(frame_ids, dim=0)
675
- edit_list.append(ie)
676
- edit_id_list.append(ie_id)
677
- edit_mask_x_list.append(torch.ones(ie.shape[0]).to(ie.device, non_blocking=True).bool())
678
- edit = pad_sequence(tuple(edit_list), batch_first=True)
679
- edit_ids = pad_sequence(tuple(edit_id_list), batch_first=True).to(x) # [b,pad_seq,2] pad (0.,0.) at dim2
680
- edit_mask_x = pad_sequence(tuple(edit_mask_x_list), batch_first=True)
681
- else:
682
- edit, edit_ids, edit_mask_x = None, None, None
683
-
684
- txt_list, mask_txt_list, y_list = [], [], []
685
- for sample_id, (ctx, yy) in enumerate(zip(context, y)):
686
- txt_list.append(self.txt_in(ctx.to(x)))
687
- mask_txt_list.append(torch.ones(txt_list[-1].shape[0]).to(ctx.device, non_blocking=True).bool())
688
- y_list.append(yy.to(x))
689
- txt = pad_sequence(tuple(txt_list), batch_first=True)
690
- txt_ids = torch.zeros(txt.shape[0], txt.shape[1], 3).to(x)
691
- mask_txt = pad_sequence(tuple(mask_txt_list), batch_first=True)
692
- y = torch.cat(y_list, dim=0)
693
- return x, x_ids, edit, edit_ids, txt, txt_ids, y, mask_x, edit_mask_x, mask_txt, x_seq_length
694
-
695
- def unpack(self, x: Tensor, cond: dict = None, x_seq_length: list = None) -> Tensor:
696
- x_list = []
697
- image_shapes = cond["x_shapes"]
698
- for u, shape, seq_length in zip(x, image_shapes, x_seq_length):
699
- height, width = shape
700
- h, w = math.ceil(height / 2), math.ceil(width / 2)
701
- u = rearrange(
702
- u[:h * w, ...],
703
- "(h w) (c ph pw) -> (h ph w pw) c",
704
- h=h,
705
- w=w,
706
- ph=2,
707
- pw=2,
708
- )
709
- x_list.append(u)
710
- x = pad_sequence(tuple(x_list), batch_first=True).permute(0, 2, 1)
711
- return x
712
-
713
- def forward(
714
- self,
715
- x: Tensor,
716
- t: Tensor,
717
- cond: dict = {},
718
- guidance: Tensor | None = None,
719
- gc_seg: int = 0,
720
- **kwargs
721
- ) -> Tensor:
722
- x, x_ids, edit, edit_ids, txt, txt_ids, y, mask_x, edit_mask_x, mask_txt, seq_length_list = self.prepare_input(x, cond)
723
- # running on sequences img
724
- # condition use zero t
725
- x_length = x.shape[1]
726
- vec = self.time_in(timestep_embedding(t, 256))
727
-
728
- if edit is not None:
729
- edit_vec = self.time_in(timestep_embedding(t * 0, 256))
730
- # print("edit_vec", torch.sum(edit_vec))
731
- else:
732
- edit_vec = None
733
-
734
- if self.guidance_embed:
735
- if guidance is None:
736
- raise ValueError("Didn't get guidance strength for guidance distilled model.")
737
- vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
738
- if edit is not None:
739
- edit_vec = edit_vec + self.guidance_in(timestep_embedding(guidance, 256))
740
-
741
- vec = vec + self.vector_in(y)
742
- if edit is not None:
743
- edit_vec = edit_vec + self.vector_in(y)
744
- ids = torch.cat((txt_ids, x_ids, edit_ids), dim=1)
745
- mask_aside = torch.cat((mask_txt, mask_x, edit_mask_x), dim=1)
746
- x = torch.cat((txt, x, edit), 1)
747
- else:
748
- ids = torch.cat((txt_ids, x_ids), dim=1)
749
- mask_aside = torch.cat((mask_txt, mask_x), dim=1)
750
- x = torch.cat((txt, x), 1)
751
-
752
- pe = self.pe_embedder(ids)
753
- mask = mask_aside[:, None, :] * mask_aside[:, :, None]
754
-
755
- kwargs = dict(
756
- vec=vec,
757
- pe=pe,
758
- mask=mask,
759
- txt_length=txt.shape[1],
760
- x_length=x_length,
761
- edit_vec=edit_vec,
762
-
763
- )
764
-
765
- if self.use_grad_checkpoint and gc_seg >= 0:
766
- x = checkpoint_sequential(
767
- functions=[partial(block, **kwargs) for block in self.double_blocks],
768
- segments=gc_seg if gc_seg > 0 else len(self.double_blocks),
769
- input=x,
770
- use_reentrant=False
771
- )
772
- else:
773
- for idx, block in enumerate(self.double_blocks):
774
- # print("double block", idx)
775
- x = block(x, **kwargs)
776
-
777
- if self.use_grad_checkpoint and gc_seg >= 0:
778
- x = checkpoint_sequential(
779
- functions=[partial(block, **kwargs) for block in self.single_blocks],
780
- segments=gc_seg if gc_seg > 0 else len(self.single_blocks),
781
- input=x,
782
- use_reentrant=False
783
- )
784
- else:
785
- for idx, block in enumerate(self.single_blocks):
786
- # print("single block", idx)
787
- x = block(x, **kwargs)
788
- x = x[:, txt.shape[1]:txt.shape[1] + x_length, ...]
789
- x = self.final_layer(x, vec) # (N, T, patch_size ** 2 * out_channels) 6 64 64
790
- x = self.unpack(x, cond, seq_length_list)
791
- return x
792
-
793
- @staticmethod
794
- def get_config_template():
795
- return dict_to_yaml('MODEL',
796
- __class__.__name__,
797
- ACEFlux.para_dict,
798
- set_name=True)
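
For reference, the merge_* helpers above all fold a LoRA pair back into the base weight as W += scale * (lora_B @ lora_A). A minimal standalone sketch of that update, with illustrative shapes (the names and sizes here are only an example, not taken from the checkpoint):

import torch

rank, d_in, d_out, scale = 4, 64, 64, 1.0
base_w = torch.randn(d_out, d_in)   # base nn.Linear weight, shape (out_features, in_features)
lora_A = torch.randn(rank, d_in)    # lora_A.weight
lora_B = torch.randn(d_out, rank)   # lora_B.weight

# same permute/matmul/permute pattern as merge_swift_lora, equivalent to lora_B @ lora_A
delta = torch.matmul(lora_A.permute(1, 0), lora_B.permute(1, 0)).permute(1, 0)
merged = base_w + scale * delta
assert torch.allclose(delta, lora_B @ lora_A)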
 
models/layers.py DELETED
@@ -1,497 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import math
4
- from dataclasses import dataclass
5
- from torch import Tensor, nn
6
- import torch
7
- from einops import rearrange, repeat
8
- from torch import Tensor
9
- from torch.nn.utils.rnn import pad_sequence
10
-
11
- try:
12
- from flash_attn import (
13
- flash_attn_varlen_func
14
- )
15
- FLASHATTN_IS_AVAILABLE = True
16
- except ImportError:
17
- FLASHATTN_IS_AVAILABLE = False
18
- flash_attn_varlen_func = None
19
-
20
- def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, mask: Tensor | None = None, backend = 'pytorch') -> Tensor:
21
- q, k = apply_rope(q, k, pe)
22
- if backend == 'pytorch':
23
- if mask is not None and mask.dtype == torch.bool:
24
- mask = torch.zeros_like(mask).to(q).masked_fill_(mask.logical_not(), -1e20)
25
- x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=mask)
26
- # x = torch.nan_to_num(x, nan=0.0, posinf=1e10, neginf=-1e10)
27
- x = rearrange(x, "B H L D -> B L (H D)")
28
- elif backend == 'flash_attn':
29
- # q: (B, H, L, D)
30
- # k: (B, H, S, D) now L = S
31
- # v: (B, H, S, D)
32
- b, h, lq, d = q.shape
33
- _, _, lk, _ = k.shape
34
- q = rearrange(q, "B H L D -> B L H D")
35
- k = rearrange(k, "B H S D -> B S H D")
36
- v = rearrange(v, "B H S D -> B S H D")
37
- if mask is None:
38
- q_lens = torch.tensor([lq] * b, dtype=torch.int32).to(q.device, non_blocking=True)
39
- k_lens = torch.tensor([lk] * b, dtype=torch.int32).to(k.device, non_blocking=True)
40
- else:
41
- q_lens = torch.sum(mask[:, 0, :, 0], dim=1).int()
42
- k_lens = torch.sum(mask[:, 0, 0, :], dim=1).int()
43
- q = torch.cat([q_v[:q_l] for q_v, q_l in zip(q, q_lens)])
44
- k = torch.cat([k_v[:k_l] for k_v, k_l in zip(k, k_lens)])
45
- v = torch.cat([v_v[:v_l] for v_v, v_l in zip(v, k_lens)])
46
- cu_seqlens_q = torch.cat([q_lens.new_zeros([1]), q_lens]).cumsum(0, dtype=torch.int32)
47
- cu_seqlens_k = torch.cat([k_lens.new_zeros([1]), k_lens]).cumsum(0, dtype=torch.int32)
48
- max_seqlen_q = q_lens.max()
49
- max_seqlen_k = k_lens.max()
50
-
51
- x = flash_attn_varlen_func(
52
- q,
53
- k,
54
- v,
55
- cu_seqlens_q=cu_seqlens_q,
56
- cu_seqlens_k=cu_seqlens_k,
57
- max_seqlen_q=max_seqlen_q,
58
- max_seqlen_k=max_seqlen_k
59
- )
60
- x_list = [x[cu_seqlens_q[i]:cu_seqlens_q[i+1]] for i in range(b)]
61
- x = pad_sequence(tuple(x_list), batch_first=True)
62
- x = rearrange(x, "B L H D -> B L (H D)")
63
- else:
64
- raise NotImplementedError
65
- return x
66
-
67
-
68
- def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
69
- assert dim % 2 == 0
70
- scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim
71
- omega = 1.0 / (theta**scale)
72
- out = torch.einsum("...n,d->...nd", pos, omega)
73
- out = torch.stack([torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1)
74
- out = rearrange(out, "b n d (i j) -> b n d i j", i=2, j=2)
75
- return out.float()
76
-
77
-
78
- def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor) -> tuple[Tensor, Tensor]:
79
- xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
80
- xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
81
- xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
82
- xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
83
- return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
84
-
85
- class EmbedND(nn.Module):
86
- def __init__(self, dim: int, theta: int, axes_dim: list[int]):
87
- super().__init__()
88
- self.dim = dim
89
- self.theta = theta
90
- self.axes_dim = axes_dim
91
-
92
- def forward(self, ids: Tensor) -> Tensor:
93
- n_axes = ids.shape[-1]
94
- emb = torch.cat(
95
- [rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)],
96
- dim=-3,
97
- )
98
-
99
- return emb.unsqueeze(1)
100
-
101
-
102
- def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0):
103
- """
104
- Create sinusoidal timestep embeddings.
105
- :param t: a 1-D Tensor of N indices, one per batch element.
106
- These may be fractional.
107
- :param dim: the dimension of the output.
108
- :param max_period: controls the minimum frequency of the embeddings.
109
- :return: an (N, D) Tensor of positional embeddings.
110
- """
111
- t = time_factor * t
112
- half = dim // 2
113
- freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(
114
- t.device
115
- )
116
-
117
- args = t[:, None].float() * freqs[None]
118
- embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
119
- if dim % 2:
120
- embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
121
- if torch.is_floating_point(t):
122
- embedding = embedding.to(t)
123
- return embedding
124
-
125
-
126
- class MLPEmbedder(nn.Module):
127
- def __init__(self, in_dim: int, hidden_dim: int):
128
- super().__init__()
129
- self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True)
130
- self.silu = nn.SiLU()
131
- self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True)
132
-
133
- def forward(self, x: Tensor) -> Tensor:
134
- return self.out_layer(self.silu(self.in_layer(x)))
135
-
136
-
137
- class RMSNorm(torch.nn.Module):
138
- def __init__(self, dim: int):
139
- super().__init__()
140
- self.scale = nn.Parameter(torch.ones(dim))
141
-
142
- def forward(self, x: Tensor):
143
- x_dtype = x.dtype
144
- x = x.float()
145
- rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + 1e-6)
146
- return (x * rrms).to(dtype=x_dtype) * self.scale
147
-
148
-
149
- class QKNorm(torch.nn.Module):
150
- def __init__(self, dim: int):
151
- super().__init__()
152
- self.query_norm = RMSNorm(dim)
153
- self.key_norm = RMSNorm(dim)
154
-
155
- def forward(self, q: Tensor, k: Tensor, v: Tensor) -> tuple[Tensor, Tensor]:
156
- q = self.query_norm(q)
157
- k = self.key_norm(k)
158
- return q.to(v), k.to(v)
159
-
160
-
161
- class SelfAttention(nn.Module):
162
- def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False):
163
- super().__init__()
164
- self.num_heads = num_heads
165
- head_dim = dim // num_heads
166
-
167
- self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
168
- self.norm = QKNorm(head_dim)
169
- self.proj = nn.Linear(dim, dim)
170
-
171
- def forward(self, x: Tensor, pe: Tensor, mask: Tensor | None = None) -> Tensor:
172
- qkv = self.qkv(x)
173
- q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
174
- q, k = self.norm(q, k, v)
175
- x = attention(q, k, v, pe=pe, mask=mask)
176
- x = self.proj(x)
177
- return x
178
-
179
- class CrossAttention(nn.Module):
180
- def __init__(self, dim: int, context_dim: int, num_heads: int = 8, qkv_bias: bool = False):
181
- super().__init__()
182
- self.num_heads = num_heads
183
- head_dim = dim // num_heads
184
- self.q = nn.Linear(dim, dim, bias=qkv_bias)
185
- self.kv = nn.Linear(context_dim, dim * 2, bias=qkv_bias)
186
- self.norm = QKNorm(head_dim)
187
- self.proj = nn.Linear(dim, dim)
188
-
189
- def forward(self, x: Tensor, context: Tensor, pe: Tensor, mask: Tensor | None = None) -> Tensor:
190
- q = rearrange(self.q(x), "B L (H D) -> B H L D", H=self.num_heads)
191
- k, v = rearrange(self.kv(context), "B L (K H D) -> K B H L D", K=2, H=self.num_heads)
192
- q, k = self.norm(q, k, v)
193
- x = attention(q, k, v, pe=pe, mask=mask)
194
- x = self.proj(x)
195
- return x
196
-
197
-
198
- @dataclass
199
- class ModulationOut:
200
- shift: Tensor
201
- scale: Tensor
202
- gate: Tensor
203
-
204
-
205
- class Modulation(nn.Module):
206
- def __init__(self, dim: int, double: bool):
207
- super().__init__()
208
- self.is_double = double
209
- self.multiplier = 6 if double else 3
210
- self.lin = nn.Linear(dim, self.multiplier * dim, bias=True)
211
-
212
- def forward(self, vec: Tensor) -> tuple[ModulationOut, ModulationOut | None]:
213
- out = self.lin(nn.functional.silu(vec))[:, None, :].chunk(self.multiplier, dim=-1)
214
-
215
- return (
216
- ModulationOut(*out[:3]),
217
- ModulationOut(*out[3:]) if self.is_double else None,
218
- )
219
-
220
-
221
- class DoubleStreamBlock(nn.Module):
222
- def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, backend = 'pytorch'):
223
- super().__init__()
224
-
225
- mlp_hidden_dim = int(hidden_size * mlp_ratio)
226
- self.num_heads = num_heads
227
- self.hidden_size = hidden_size
228
- self.img_mod = Modulation(hidden_size, double=True)
229
- self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
230
- self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
231
-
232
- self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
233
- self.img_mlp = nn.Sequential(
234
- nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
235
- nn.GELU(approximate="tanh"),
236
- nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
237
- )
238
-
239
- self.backend = backend
240
-
241
- self.txt_mod = Modulation(hidden_size, double=True)
242
- self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
243
- self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
244
-
245
- self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
246
- self.txt_mlp = nn.Sequential(
247
- nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
248
- nn.GELU(approximate="tanh"),
249
- nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
250
- )
251
-
252
-
253
-
254
-
255
- def forward(self, x: Tensor, vec: Tensor, pe: Tensor, mask: Tensor = None, txt_length = None):
256
- img_mod1, img_mod2 = self.img_mod(vec)
257
- txt_mod1, txt_mod2 = self.txt_mod(vec)
258
-
259
- txt, img = x[:, :txt_length], x[:, txt_length:]
260
-
261
- # prepare image for attention
262
- img_modulated = self.img_norm1(img)
263
- img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
264
- img_qkv = self.img_attn.qkv(img_modulated)
265
- img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
266
- img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)
267
- # prepare txt for attention
268
- txt_modulated = self.txt_norm1(txt)
269
- txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
270
- txt_qkv = self.txt_attn.qkv(txt_modulated)
271
- txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
272
- txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
273
-
274
- # run actual attention
275
- q = torch.cat((txt_q, img_q), dim=2)
276
- k = torch.cat((txt_k, img_k), dim=2)
277
- v = torch.cat((txt_v, img_v), dim=2)
278
- if mask is not None:
279
- mask = repeat(mask, 'B L S-> B H L S', H=self.num_heads)
280
- attn = attention(q, k, v, pe=pe, mask = mask, backend = self.backend)
281
- txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]
282
-
283
- # calculate the img blocks
284
- img = img + img_mod1.gate * self.img_attn.proj(img_attn)
285
- img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift)
286
-
287
- # calculate the txt blocks
288
- txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
289
- txt = txt + txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift)
290
- x = torch.cat((txt, img), 1)
291
- return x
292
-
293
-
294
- class SingleStreamBlock(nn.Module):
295
- """
296
- A DiT block with parallel linear layers as described in
297
- https://arxiv.org/abs/2302.05442 and adapted modulation interface.
298
- """
299
-
300
- def __init__(
301
- self,
302
- hidden_size: int,
303
- num_heads: int,
304
- mlp_ratio: float = 4.0,
305
- qk_scale: float | None = None,
306
- backend='pytorch'
307
- ):
308
- super().__init__()
309
- self.hidden_dim = hidden_size
310
- self.num_heads = num_heads
311
- head_dim = hidden_size // num_heads
312
- self.scale = qk_scale or head_dim**-0.5
313
-
314
- self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
315
- # qkv and mlp_in
316
- self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim)
317
- # proj and mlp_out
318
- self.linear2 = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size)
319
-
320
- self.norm = QKNorm(head_dim)
321
-
322
- self.hidden_size = hidden_size
323
- self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
324
-
325
- self.mlp_act = nn.GELU(approximate="tanh")
326
- self.modulation = Modulation(hidden_size, double=False)
327
- self.backend = backend
328
-
329
- def forward(self, x: Tensor, vec: Tensor, pe: Tensor, mask: Tensor = None) -> Tensor:
330
- mod, _ = self.modulation(vec)
331
- x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
332
- qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
333
-
334
- q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
335
- q, k = self.norm(q, k, v)
336
- if mask is not None:
337
- mask = repeat(mask, 'B L S-> B H L S', H=self.num_heads)
338
- # compute attention
339
- attn = attention(q, k, v, pe=pe, mask = mask, backend=self.backend)
340
- # compute activation in mlp stream, cat again and run second linear layer
341
- output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
342
- return x + mod.gate * output
343
-
344
-
345
- class DoubleStreamBlockACE(DoubleStreamBlock):
346
- def forward(self,
347
- x: Tensor,
348
- vec: Tensor,
349
- pe: Tensor,
350
- edit_vec: Tensor | None = None,
351
- mask: Tensor = None,
352
- txt_length = None,
353
- x_length = None):
354
- img_mod1, img_mod2 = self.img_mod(vec)
355
- txt_mod1, txt_mod2 = self.txt_mod(vec)
356
- if edit_vec is not None:
357
- edit_mod1, edit_mod2 = self.img_mod(edit_vec)
358
- txt, img, edit = x[:, :txt_length], x[:, txt_length:txt_length+x_length], x[:, txt_length+x_length:]
359
- else:
360
- edit_mod1, edit_mod2 = None, None
361
- txt, img = x[:, :txt_length], x[:, txt_length:]
362
- edit = None
363
-
364
-
365
- # prepare image for attention
366
- img_modulated = self.img_norm1(img)
367
- img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
368
- img_qkv = self.img_attn.qkv(img_modulated)
369
- img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
370
- img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)
371
- # prepare txt for attention
372
- txt_modulated = self.txt_norm1(txt)
373
- txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
374
- txt_qkv = self.txt_attn.qkv(txt_modulated)
375
- txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
376
- txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
377
- # prepare edit for attention
378
- if edit_vec is not None:
379
- edit_modulated = self.img_norm1(edit)
380
- edit_modulated = (1 + edit_mod1.scale) * edit_modulated + edit_mod1.shift
381
- edit_qkv = self.img_attn.qkv(edit_modulated)
382
- edit_q, edit_k, edit_v = rearrange(edit_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
383
- edit_q, edit_k = self.img_attn.norm(edit_q, edit_k, edit_v)
384
- q = torch.cat((txt_q, img_q, edit_q), dim=2)
385
- k = torch.cat((txt_k, img_k, edit_k), dim=2)
386
- v = torch.cat((txt_v, img_v, edit_v), dim=2)
387
- else:
388
- q = torch.cat((txt_q, img_q), dim=2)
389
- k = torch.cat((txt_k, img_k), dim=2)
390
- v = torch.cat((txt_v, img_v), dim=2)
391
-
392
- # run actual attention
393
- if mask is not None:
394
- mask = repeat(mask, 'B L S-> B H L S', H=self.num_heads)
395
- attn = attention(q, k, v, pe=pe, mask = mask, backend = "pytorch")
396
- if edit_vec is not None:
397
- txt_attn, img_attn, edit_attn = (attn[:, : txt.shape[1]],
398
- attn[:, txt.shape[1] : txt.shape[1]+img.shape[1]],
399
- attn[:, txt.shape[1]+img.shape[1]:])
400
- # calculate the img blocks
401
- img = img + img_mod1.gate * self.img_attn.proj(img_attn)
402
- img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift)
403
-
404
- # calculate the edit blocks
405
- edit = edit + edit_mod1.gate * self.img_attn.proj(edit_attn)
406
- edit = edit + edit_mod2.gate * self.img_mlp((1 + edit_mod2.scale) * self.img_norm2(edit) + edit_mod2.shift)
407
-
408
- # calculate the txt blocks
409
- txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
410
- txt = txt + txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift)
411
-
412
- x = torch.cat((txt, img, edit), 1)
413
- else:
414
- txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]
415
- # calculate the img blocks
416
- img = img + img_mod1.gate * self.img_attn.proj(img_attn)
417
- img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift)
418
-
419
- # calculate the txt blocks
420
- txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
421
- txt = txt + txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift)
422
- x = torch.cat((txt, img), 1)
423
- return x
424
-
425
-
426
- class SingleStreamBlockACE(SingleStreamBlock):
427
- """
428
- A DiT block with parallel linear layers as described in
429
- https://arxiv.org/abs/2302.05442 and adapted modulation interface.
430
- """
431
-
432
- def forward(self, x: Tensor, vec: Tensor,
433
- pe: Tensor, mask: Tensor = None,
434
- edit_vec: Tensor | None = None,
435
- txt_length=None,
436
- x_length=None
437
- ) -> Tensor:
438
- mod, _ = self.modulation(vec)
439
- if edit_vec is not None:
440
- x, edit = x[:, :txt_length + x_length], x[:, txt_length + x_length:]
441
- e_mod, _ = self.modulation(edit_vec)
442
- edit_mod = (1 + e_mod.scale) * self.pre_norm(edit) + e_mod.shift
443
- edit_qkv, edit_mlp = torch.split(self.linear1(edit_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
444
-
445
- x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
446
- qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
447
- qkv, mlp = torch.cat([qkv, edit_qkv], 1), torch.cat([mlp, edit_mlp], 1)
448
- else:
449
- x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
450
- qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
451
-
452
- q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
453
- q, k = self.norm(q, k, v)
454
- if mask is not None:
455
- mask = repeat(mask, 'B L S-> B H L S', H=self.num_heads)
456
- # compute attention
457
- attn = attention(q, k, v, pe=pe, mask = mask, backend="pytorch")
458
- # compute activation in mlp stream, cat again and run second linear layer
459
- output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
460
-
461
- if edit_vec is not None:
462
- x_output, edit_output = output.split([x.shape[1], edit.shape[1]], dim = 1)
463
- x = x + mod.gate * x_output
464
- edit = edit + e_mod.gate * edit_output
465
- x = torch.cat((x, edit), 1)
466
- return x
467
- else:
468
- return x + mod.gate * output
469
-
470
-
471
- class LastLayer(nn.Module):
472
- def __init__(self, hidden_size: int, patch_size: int, out_channels: int):
473
- super().__init__()
474
- self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
475
- self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
476
- self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True))
477
-
478
- def forward(self, x: Tensor, vec: Tensor) -> Tensor:
479
- shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)
480
- x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
481
- x = self.linear(x)
482
- return x
483
-
484
-
485
- if __name__ == '__main__':
486
- pe = EmbedND(dim=64, theta=10000, axes_dim=[16, 56, 56])
487
-
488
- ix_id = torch.zeros(64 // 2, 64 // 2, 3)
489
- ix_id[..., 1] = ix_id[..., 1] + torch.arange(64 // 2)[:, None]
490
- ix_id[..., 2] = ix_id[..., 2] + torch.arange(64 // 2)[None, :]
491
- ix_id = rearrange(ix_id, "h w c -> 1 (h w) c")
492
- pos = torch.cat([ix_id, ix_id], dim = 1)
493
- a = pe(pos)
494
-
495
- b = torch.cat([pe(ix_id), pe(ix_id)], dim = 2)
496
-
497
- print(a - b)
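
The printed difference above should be numerically zero: rope/EmbedND act independently per position id, so embedding the concatenated ids equals concatenating the per-sequence embeddings along the sequence axis. As a further small sanity sketch for timestep_embedding defined in this file (shapes only, assuming the function is in scope):

import torch

t = torch.tensor([0.0, 0.5, 1.0])
emb = timestep_embedding(t, dim=256)
print(emb.shape)  # torch.Size([3, 256]); first 128 features are cos terms, last 128 are sin terms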
 
requirements.txt DELETED
@@ -1,8 +0,0 @@
1
- huggingface_hub
2
- diffusers
3
- transformers
4
- gradio_imageslider
5
- torch==2.4.0
6
- xformers==0.0.27.post2
7
- torchvision
8
- gradio==4.44.1
 
utils.py DELETED
@@ -1,95 +0,0 @@
1
- # Copyright (c) Alibaba, Inc. and its affiliates.
2
- import torch
3
- import torchvision.transforms as T
4
- from PIL import Image
5
- from torchvision.transforms.functional import InterpolationMode
6
-
7
- IMAGENET_MEAN = (0.485, 0.456, 0.406)
8
- IMAGENET_STD = (0.229, 0.224, 0.225)
9
-
10
-
11
- def build_transform(input_size):
12
- MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
13
- transform = T.Compose([
14
- T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
15
- T.Resize((input_size, input_size),
16
- interpolation=InterpolationMode.BICUBIC),
17
- T.ToTensor(),
18
- T.Normalize(mean=MEAN, std=STD)
19
- ])
20
- return transform
21
-
22
-
23
- def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height,
24
- image_size):
25
- best_ratio_diff = float('inf')
26
- best_ratio = (1, 1)
27
- area = width * height
28
- for ratio in target_ratios:
29
- target_aspect_ratio = ratio[0] / ratio[1]
30
- ratio_diff = abs(aspect_ratio - target_aspect_ratio)
31
- if ratio_diff < best_ratio_diff:
32
- best_ratio_diff = ratio_diff
33
- best_ratio = ratio
34
- elif ratio_diff == best_ratio_diff:
35
- if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
36
- best_ratio = ratio
37
- return best_ratio
38
-
39
-
40
- def dynamic_preprocess(image,
41
- min_num=1,
42
- max_num=12,
43
- image_size=448,
44
- use_thumbnail=False):
45
- orig_width, orig_height = image.size
46
- aspect_ratio = orig_width / orig_height
47
-
48
- # calculate the existing image aspect ratio
49
- target_ratios = set((i, j) for n in range(min_num, max_num + 1)
50
- for i in range(1, n + 1) for j in range(1, n + 1)
51
- if i * j <= max_num and i * j >= min_num)
52
- target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
53
-
54
- # find the closest aspect ratio to the target
55
- target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio,
56
- target_ratios, orig_width,
57
- orig_height, image_size)
58
-
59
- # calculate the target width and height
60
- target_width = image_size * target_aspect_ratio[0]
61
- target_height = image_size * target_aspect_ratio[1]
62
- blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
63
-
64
- # resize the image
65
- resized_img = image.resize((target_width, target_height))
66
- processed_images = []
67
- for i in range(blocks):
68
- box = ((i % (target_width // image_size)) * image_size,
69
- (i // (target_width // image_size)) * image_size,
70
- ((i % (target_width // image_size)) + 1) * image_size,
71
- ((i // (target_width // image_size)) + 1) * image_size)
72
- # split the image
73
- split_img = resized_img.crop(box)
74
- processed_images.append(split_img)
75
- assert len(processed_images) == blocks
76
- if use_thumbnail and len(processed_images) != 1:
77
- thumbnail_img = image.resize((image_size, image_size))
78
- processed_images.append(thumbnail_img)
79
- return processed_images
80
-
81
-
82
- def load_image(image_file, input_size=448, max_num=12):
83
- if isinstance(image_file, str):
84
- image = Image.open(image_file).convert('RGB')
85
- else:
86
- image = image_file
87
- transform = build_transform(input_size=input_size)
88
- images = dynamic_preprocess(image,
89
- image_size=input_size,
90
- use_thumbnail=True,
91
- max_num=max_num)
92
- pixel_values = [transform(image) for image in images]
93
- pixel_values = torch.stack(pixel_values)
94
- return pixel_values
95
-
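
A usage sketch for the helpers above: load_image tiles the input into up to max_num crops of input_size x input_size plus a thumbnail, so the stacked tensor has shape (num_tiles, 3, input_size, input_size). The input image below is synthetic and purely illustrative:

from PIL import Image

img = Image.new('RGB', (896, 448))            # 2:1 aspect ratio -> two 448x448 tiles
pixel_values = load_image(img, input_size=448, max_num=12)
print(pixel_values.shape)                     # torch.Size([3, 3, 448, 448]): 2 tiles + thumbnail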