chaojiemao committed on
Commit 154c805 · 1 Parent(s): 86c729f

modify app.py

Files changed (6)
  1. ace_inference.py +356 -0
  2. example.py +370 -0
  3. model/__init__.py +1 -0
  4. model/flux.py +1064 -0
  5. model/layers.py +356 -0
  6. utils.py +95 -0
ace_inference.py ADDED
@@ -0,0 +1,356 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Alibaba, Inc. and its affiliates.
3
+ import copy
4
+ import math
5
+ import random
6
+
7
+ import numpy as np
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+ import torchvision.transforms.functional as TF
12
+ from PIL import Image
13
+ import torchvision.transforms as T
14
+ from scepter.modules.model.registry import DIFFUSIONS
15
+ from scepter.modules.model.utils.basic_utils import check_list_of_list
16
+ from scepter.modules.model.utils.basic_utils import \
17
+ pack_imagelist_into_tensor_v2 as pack_imagelist_into_tensor
18
+ from scepter.modules.model.utils.basic_utils import (
19
+ to_device, unpack_tensor_into_imagelist)
20
+ from scepter.modules.utils.distribute import we
21
+ from scepter.modules.utils.logger import get_logger
22
+
23
+ from scepter.modules.inference.diffusion_inference import DiffusionInference, get_model
24
+
25
+
26
+ def process_edit_image(images,
27
+ masks,
28
+ tasks,
29
+ max_seq_len=1024,
30
+ max_aspect_ratio=4,
31
+ d=16,
32
+ **kwargs):
33
+
34
+ if not isinstance(images, list):
35
+ images = [images]
36
+ if not isinstance(masks, list):
37
+ masks = [masks]
38
+ if not isinstance(tasks, list):
39
+ tasks = [tasks]
40
+
41
+ img_tensors = []
42
+ mask_tensors = []
43
+ for img, mask, task in zip(images, masks, tasks):
44
+ if mask is None or mask == '':
45
+ mask = Image.new('L', img.size, 0)
46
+ W, H = img.size
47
+ if H / W > max_aspect_ratio:
48
+ img = TF.center_crop(img, [int(max_aspect_ratio * W), W])
49
+ mask = TF.center_crop(mask, [int(max_aspect_ratio * W), W])
50
+ elif W / H > max_aspect_ratio:
51
+ img = TF.center_crop(img, [H, int(max_aspect_ratio * H)])
52
+ mask = TF.center_crop(mask, [H, int(max_aspect_ratio * H)])
53
+
54
+ H, W = img.height, img.width
55
+ scale = min(1.0, math.sqrt(max_seq_len / ((H / d) * (W / d))))
56
+ rH = int(H * scale) // d * d # ensure divisible by d
57
+ rW = int(W * scale) // d * d
58
+
59
+ img = TF.resize(img, (rH, rW),
60
+ interpolation=TF.InterpolationMode.BICUBIC)
61
+ mask = TF.resize(mask, (rH, rW),
62
+ interpolation=TF.InterpolationMode.NEAREST_EXACT)
63
+
64
+ mask = np.asarray(mask)
65
+ mask = np.where(mask > 128, 1, 0)
66
+ mask = mask.astype(
67
+ np.float32) if np.any(mask) else np.ones_like(mask).astype(
68
+ np.float32)
69
+
70
+ img_tensor = TF.to_tensor(img).to(we.device_id)
71
+ img_tensor = TF.normalize(img_tensor,
72
+ mean=[0.5, 0.5, 0.5],
73
+ std=[0.5, 0.5, 0.5])
74
+ mask_tensor = TF.to_tensor(mask).to(we.device_id)
75
+ if task in ['inpainting', 'Try On', 'Inpainting']:
76
+ mask_indicator = mask_tensor.repeat(3, 1, 1)
77
+ img_tensor[mask_indicator == 1] = -1.0
78
+ img_tensors.append(img_tensor)
79
+ mask_tensors.append(mask_tensor)
80
+ return img_tensors, mask_tensors
81
+
82
+ class TextEmbedding(nn.Module):
83
+ def __init__(self, embedding_shape):
84
+ super().__init__()
85
+ self.pos = nn.Parameter(data=torch.zeros(embedding_shape))
86
+
87
+ class ACEFluxLCInference(DiffusionInference):
88
+ def __init__(self, logger=None):
89
+ if logger is None:
90
+ logger = get_logger(name='scepter')
91
+ self.logger = logger
92
+ self.loaded_model = {}
93
+ self.loaded_model_name = [
94
+ 'diffusion_model', 'first_stage_model', 'cond_stage_model', 'ref_cond_stage_model'
95
+ ]
96
+
97
+ def init_from_cfg(self, cfg):
98
+ self.name = cfg.NAME
99
+ self.is_default = cfg.get('IS_DEFAULT', False)
100
+ self.use_dynamic_model = cfg.get('USE_DYNAMIC_MODEL', True)
101
+ module_paras = self.load_default(cfg.get('DEFAULT_PARAS', None))
102
+ assert cfg.have('MODEL')
103
+ self.size_factor = cfg.get('SIZE_FACTOR', 8)
104
+ self.diffusion_model = self.infer_model(
105
+ cfg.MODEL.DIFFUSION_MODEL, module_paras.get(
106
+ 'DIFFUSION_MODEL',
107
+ None)) if cfg.MODEL.have('DIFFUSION_MODEL') else None
108
+ self.first_stage_model = self.infer_model(
109
+ cfg.MODEL.FIRST_STAGE_MODEL,
110
+ module_paras.get(
111
+ 'FIRST_STAGE_MODEL',
112
+ None)) if cfg.MODEL.have('FIRST_STAGE_MODEL') else None
113
+ self.cond_stage_model = self.infer_model(
114
+ cfg.MODEL.COND_STAGE_MODEL,
115
+ module_paras.get(
116
+ 'COND_STAGE_MODEL',
117
+ None)) if cfg.MODEL.have('COND_STAGE_MODEL') else None
118
+
119
+ self.ref_cond_stage_model = self.infer_model(
120
+ cfg.MODEL.REF_COND_STAGE_MODEL,
121
+ module_paras.get(
122
+ 'REF_COND_STAGE_MODEL',
123
+ None)) if cfg.MODEL.have('REF_COND_STAGE_MODEL') else None
124
+
125
+ self.diffusion = DIFFUSIONS.build(cfg.MODEL.DIFFUSION,
126
+ logger=self.logger)
127
+ self.interpolate_func = lambda x: (F.interpolate(
128
+ x.unsqueeze(0),
129
+ scale_factor=1 / self.size_factor,
130
+ mode='nearest-exact') if x is not None else None)
131
+
132
+ self.max_seq_length = cfg.get("MAX_SEQ_LENGTH", 4096)
133
+ self.src_max_seq_length = cfg.get("SRC_MAX_SEQ_LENGTH", 1024)
134
+ self.image_token = cfg.MODEL.get("IMAGE_TOKEN", "<img>")
135
+
136
+ self.text_indentifers = cfg.MODEL.get('TEXT_IDENTIFIER', [])
137
+ self.use_text_pos_embeddings = cfg.MODEL.get('USE_TEXT_POS_EMBEDDINGS',
138
+ False)
139
+ if self.use_text_pos_embeddings:
140
+ self.text_position_embeddings = TextEmbedding(
141
+ (10, 4096)).eval().requires_grad_(False).to(we.device_id)
142
+ else:
143
+ self.text_position_embeddings = None
144
+
145
+ if not self.use_dynamic_model:
146
+ self.dynamic_load(self.first_stage_model, 'first_stage_model')
147
+ self.dynamic_load(self.cond_stage_model, 'cond_stage_model')
148
+ if self.ref_cond_stage_model is not None: self.dynamic_load(self.ref_cond_stage_model, 'ref_cond_stage_model')
149
+ self.dynamic_load(self.diffusion_model, 'diffusion_model')
150
+
151
+ def upscale_resize(self, image, interpolation=T.InterpolationMode.BILINEAR):
152
+ c, H, W = image.shape
153
+ scale = max(1.0, math.sqrt(self.max_seq_length / ((H / 16) * (W / 16))))
154
+ rH = int(H * scale) // 16 * 16 # ensure divisible by 16
155
+ rW = int(W * scale) // 16 * 16
156
+ image = T.Resize((rH, rW), interpolation=interpolation, antialias=True)(image)
157
+ return image
158
+
159
+
160
+ @torch.no_grad()
161
+ def encode_first_stage(self, x, **kwargs):
162
+ _, dtype = self.get_function_info(self.first_stage_model, 'encode')
163
+ with torch.autocast('cuda',
164
+ enabled=dtype in ('float16', 'bfloat16'),
165
+ dtype=getattr(torch, dtype)):
166
+ def run_one_image(u):
167
+ zu = get_model(self.first_stage_model).encode(u)
168
+ if isinstance(zu, (tuple, list)):
169
+ zu = zu[0]
170
+ return zu
171
+
172
+ z = [run_one_image(u.unsqueeze(0) if u.dim() == 3 else u) for u in x]
173
+ return z
174
+
175
+
176
+ @torch.no_grad()
177
+ def decode_first_stage(self, z):
178
+ _, dtype = self.get_function_info(self.first_stage_model, 'decode')
179
+ with torch.autocast('cuda',
180
+ enabled=dtype in ('float16', 'bfloat16'),
181
+ dtype=getattr(torch, dtype)):
182
+ return [get_model(self.first_stage_model).decode(zu) for zu in z]
183
+
184
+ def noise_sample(self, num_samples, h, w, seed, device = None, dtype = torch.bfloat16):
185
+ noise = torch.randn(
186
+ num_samples,
187
+ 16,
188
+ # allow for packing
189
+ 2 * math.ceil(h / 16),
190
+ 2 * math.ceil(w / 16),
191
+ device=device,
192
+ dtype=dtype,
193
+ generator=torch.Generator(device=device).manual_seed(seed),
194
+ )
195
+ return noise
196
+
197
+ # def preprocess_prompt(self, prompt):
198
+ # prompt_ = [[pp] if isinstance(pp, str) else pp for pp in prompt]
199
+ # for pp_id, pp in enumerate(prompt_):
200
+ # prompt_[pp_id] = [""] + pp
201
+ # for p_id, p in enumerate(prompt_[pp_id]):
202
+ # prompt_[pp_id][p_id] = self.image_token + self.text_indentifers[p_id] + " " + p
203
+ # prompt_[pp_id] = [f";".join(prompt_[pp_id])]
204
+ # return prompt_
205
+
206
+ @torch.no_grad()
207
+ def __call__(self,
208
+ image=None,
209
+ mask=None,
210
+ prompt='',
211
+ task=None,
212
+ negative_prompt='',
213
+ output_height=1024,
214
+ output_width=1024,
215
+ sampler='flow_euler',
216
+ sample_steps=20,
217
+ guide_scale=3.5,
218
+ seed=-1,
219
+ history_io=None,
220
+ tar_index=0,
221
+ align=0,
222
+ **kwargs):
223
+ input_image, input_mask = image, mask
224
+ seed = seed if seed >= 0 else random.randint(0, 2**32 - 1)
225
+ if input_image is not None:
226
+ # assert isinstance(input_image, list) and isinstance(input_mask, list)
227
+ if task is None:
228
+ task = [''] * len(input_image)
229
+ if not isinstance(prompt, list):
230
+ prompt = [prompt] * len(input_image)
231
+ prompt = [
232
+ pp.replace('{image}', f'{{image{i}}}') if i > 0 else pp
233
+ for i, pp in enumerate(prompt)
234
+ ]
235
+ edit_image, edit_image_mask = process_edit_image(
236
+ input_image, input_mask, task, max_seq_len=self.src_max_seq_length)
237
+ image, image_mask = self.upscale_resize(edit_image[tar_index]), self.upscale_resize(edit_image_mask[
238
+ tar_index])
239
+ # edit_image, edit_image_mask = [[self.upscale_resize(i) for i in edit_image]], [[self.upscale_resize(i) for i in edit_image_mask]]
240
+ # image, image_mask = edit_image[tar_index], edit_image_mask[tar_index]
241
+ edit_image, edit_image_mask = [edit_image], [edit_image_mask]
242
+ else:
243
+ edit_image = edit_image_mask = [[]]
244
+ image = torch.zeros(
245
+ size=[3, int(output_height),
246
+ int(output_width)])
247
+ image_mask = torch.ones(
248
+ size=[1, int(output_height),
249
+ int(output_width)])
250
+ if not isinstance(prompt, list):
251
+ prompt = [prompt]
252
+
253
+ image, image_mask, prompt = [image], [image_mask], [prompt]
254
+ align = [align for p in prompt] if isinstance(align, int) else align
255
+
256
+ assert check_list_of_list(prompt) and check_list_of_list(
257
+ edit_image) and check_list_of_list(edit_image_mask)
258
+ # negative prompt is not used
259
+ image = to_device(image)
260
+ ctx = {}
261
+ # Get Noise Shape
262
+ self.dynamic_load(self.first_stage_model, 'first_stage_model')
263
+ x = self.encode_first_stage(image)
264
+ self.dynamic_unload(self.first_stage_model,
265
+ 'first_stage_model',
266
+ skip_loaded=not self.use_dynamic_model)
267
+
268
+ g = torch.Generator(device=we.device_id).manual_seed(seed)
269
+
270
+ noise = [
271
+ torch.randn((1, 16, i.shape[2], i.shape[3]), device=we.device_id, dtype=torch.bfloat16).normal_(generator=g)
272
+ for i in x
273
+ ]
274
+ noise, x_shapes = pack_imagelist_into_tensor(noise)
275
+ ctx['x_shapes'] = x_shapes
276
+ ctx['align'] = align
277
+
278
+ image_mask = to_device(image_mask, strict=False)
279
+ cond_mask = [self.interpolate_func(i) for i in image_mask
280
+ ] if image_mask is not None else [None] * len(image)
281
+ ctx['x_mask'] = cond_mask
282
+ # Encode Prompt
283
+ instruction_prompt = [[pp[-1]] if "{image}" in pp[-1] else ["{image} " + pp[-1]] for pp in prompt]
284
+ self.dynamic_load(self.cond_stage_model, 'cond_stage_model')
285
+ function_name, dtype = self.get_function_info(self.cond_stage_model)
286
+ cont = getattr(get_model(self.cond_stage_model), function_name)(instruction_prompt)
287
+ cont["context"] = [ct[-1] for ct in cont["context"]]
288
+ cont["y"] = [ct[-1] for ct in cont["y"]]
289
+ self.dynamic_unload(self.cond_stage_model,
290
+ 'cond_stage_model',
291
+ skip_loaded=not self.use_dynamic_model)
292
+ ctx.update(cont)
293
+
294
+ # Encode Edit Images
295
+ self.dynamic_load(self.first_stage_model, 'first_stage_model')
296
+ edit_image = [to_device(i, strict=False) for i in edit_image]
297
+ edit_image_mask = [to_device(i, strict=False) for i in edit_image_mask]
298
+ e_img, e_mask = [], []
299
+ for u, m in zip(edit_image, edit_image_mask):
300
+ if u is None:
301
+ continue
302
+ if m is None:
303
+ m = [None] * len(u)
304
+ e_img.append(self.encode_first_stage(u, **kwargs))
305
+ e_mask.append([self.interpolate_func(i) for i in m])
306
+ self.dynamic_unload(self.first_stage_model,
307
+ 'first_stage_model',
308
+ skip_loaded=not self.use_dynamic_model)
309
+ ctx['edit_x'] = e_img
310
+ ctx['edit_mask'] = e_mask
311
+ # Encode Ref Images
312
+ if guide_scale is not None:
313
+ guide_scale = torch.full((noise.shape[0],), guide_scale, device=noise.device, dtype=noise.dtype)
314
+ else:
315
+ guide_scale = None
316
+
317
+ # Diffusion Process
318
+ self.dynamic_load(self.diffusion_model, 'diffusion_model')
319
+ function_name, dtype = self.get_function_info(self.diffusion_model)
320
+ with torch.autocast('cuda',
321
+ enabled=dtype in ('float16', 'bfloat16'),
322
+ dtype=getattr(torch, dtype)):
323
+ latent = self.diffusion.sample(
324
+ noise=noise,
325
+ sampler=sampler,
326
+ model=get_model(self.diffusion_model),
327
+ model_kwargs={
328
+ "cond": ctx, "guidance": guide_scale, "gc_seg": -1
329
+ },
330
+ steps=sample_steps,
331
+ show_progress=True,
332
+ guide_scale=guide_scale,
333
+ return_intermediate=None,
334
+ reverse_scale=-1,
335
+ **kwargs).float()
336
+ if self.use_dynamic_model: self.dynamic_unload(self.diffusion_model,
337
+ 'diffusion_model',
338
+ skip_loaded=not self.use_dynamic_model)
339
+
340
+ # Decode to Pixel Space
341
+ self.dynamic_load(self.first_stage_model, 'first_stage_model')
342
+ samples = unpack_tensor_into_imagelist(latent, x_shapes)
343
+ x_samples = self.decode_first_stage(samples)
344
+ self.dynamic_unload(self.first_stage_model,
345
+ 'first_stage_model',
346
+ skip_loaded=not self.use_dynamic_model)
347
+ x_samples = [x.squeeze(0) for x in x_samples]
348
+
349
+ imgs = [
350
+ torch.clamp((x_i.float() + 1.0) / 2.0,
351
+ min=0.0,
352
+ max=1.0).squeeze(0).permute(1, 2, 0).cpu().numpy()
353
+ for x_i in x_samples
354
+ ]
355
+ imgs = [Image.fromarray((img * 255).astype(np.uint8)) for img in imgs]
356
+ return imgs
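
For orientation, here is a minimal usage sketch of the ACEFluxLCInference class added above. It is a sketch under stated assumptions, not part of this commit: the config file name and its keys are placeholders (the real configuration is loaded in app.py, which this diff does not show), while the call signature follows __call__ as defined above.

# Hypothetical usage sketch; 'config/ace_flux.yaml' and its contents are assumed.
from PIL import Image
from scepter.modules.utils.config import Config

from ace_inference import ACEFluxLCInference

cfg = Config(load=True, cfg_file='config/ace_flux.yaml')  # assumed scepter-style config
pipe = ACEFluxLCInference()
pipe.init_from_cfg(cfg)

src = Image.open('input.jpg').convert('RGB')
msk = Image.open('mask.png').convert('L')    # optional; pass None to edit the whole image
out = pipe(image=[src],
           mask=[msk],
           prompt=['{image} let the man smile'],
           task=['inpainting'],
           sample_steps=20,
           guide_scale=3.5,
           seed=6666)
out[0].save('output.jpg')

The call returns a list of PIL images, one per decoded latent.
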
example.py ADDED
@@ -0,0 +1,370 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Alibaba, Inc. and its affiliates.
3
+ import os
4
+ from PIL import Image
5
+ from scepter.modules.utils.file_system import FS
6
+
7
+
8
+ def download_image(image, local_path=None):
9
+ if not FS.exists(local_path):
10
+ local_path = FS.get_from(image, local_path=local_path)
11
+ if local_path.split(".")[-1] in ['jpg', 'jpeg']:
12
+ im = Image.open(local_path).convert("RGB")
13
+ im.save(local_path, format='JPEG')
14
+ return local_path
15
+
16
+
17
+ def get_examples(cache_dir):
18
+ print('Downloading Examples ...')
19
+ examples = [
20
+ [
21
+ 'Facial Editing',
22
+ download_image(
23
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/e33edc106953.png?raw=true',
24
+ os.path.join(cache_dir, 'examples/e33edc106953.jpg')), None,
25
+ None, '{image} let the man smile', 6666
26
+ ],
27
+ [
28
+ 'Facial Editing',
29
+ download_image(
30
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/5d2bcc91a3e9.png?raw=true',
31
+ os.path.join(cache_dir, 'examples/5d2bcc91a3e9.jpg')), None,
32
+ None, 'let the man in {image} wear sunglasses', 9999
33
+ ],
34
+ [
35
+ 'Facial Editing',
36
+ download_image(
37
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/3a52eac708bd.png?raw=true',
38
+ os.path.join(cache_dir, 'examples/3a52eac708bd.jpg')), None,
39
+ None, '{image} red hair', 9999
40
+ ],
41
+ [
42
+ 'Facial Editing',
43
+ download_image(
44
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/3f4dc464a0ea.png?raw=true',
45
+ os.path.join(cache_dir, 'examples/3f4dc464a0ea.jpg')), None,
46
+ None, '{image} let the man serious', 99999
47
+ ],
48
+ [
49
+ 'Controllable Generation',
50
+ download_image(
51
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/131ca90fd2a9.png?raw=true',
52
+ os.path.join(cache_dir,
53
+ 'examples/131ca90fd2a9.jpg')), None, None,
54
+ '"A person sits contemplatively on the ground, surrounded by falling autumn leaves. Dressed in a green sweater and dark blue pants, they rest their chin on their hand, exuding a relaxed demeanor. Their stylish checkered slip-on shoes add a touch of flair, while a black purse lies in their lap. The backdrop of muted brown enhances the warm, cozy atmosphere of the scene." , generate the image that corresponds to the given scribble {image}.',
55
+ 613725
56
+ ],
57
+ [
58
+ 'Render Text',
59
+ download_image(
60
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/33e9f27c2c48.png?raw=true',
61
+ os.path.join(cache_dir, 'examples/33e9f27c2c48.jpg')),
62
+ download_image(
63
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/33e9f27c2c48_mask.png?raw=true',
64
+ os.path.join(cache_dir,
65
+ 'examples/33e9f27c2c48_mask.jpg')), None,
66
+ 'Put the text "C A T" at the position marked by mask in the {image}',
67
+ 6666
68
+ ],
69
+ [
70
+ 'Style Transfer',
71
+ download_image(
72
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/9e73e7eeef55.png?raw=true',
73
+ os.path.join(cache_dir, 'examples/9e73e7eeef55.jpg')), None,
74
+ download_image(
75
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/2e02975293d6.png?raw=true',
76
+ os.path.join(cache_dir, 'examples/2e02975293d6.jpg')),
77
+ 'edit {image} based on the style of {image1} ', 99999
78
+ ],
79
+ [
80
+ 'Outpainting',
81
+ download_image(
82
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/f2b22c08be3f.png?raw=true',
83
+ os.path.join(cache_dir, 'examples/f2b22c08be3f.jpg')),
84
+ download_image(
85
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/f2b22c08be3f_mask.png?raw=true',
86
+ os.path.join(cache_dir,
87
+ 'examples/f2b22c08be3f_mask.jpg')), None,
88
+ 'Could the {image} be widened within the space designated by mask, while retaining the original?',
89
+ 6666
90
+ ],
91
+ [
92
+ 'Image Segmentation',
93
+ download_image(
94
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/db3ebaa81899.png?raw=true',
95
+ os.path.join(cache_dir, 'examples/db3ebaa81899.jpg')), None,
96
+ None, '{image} Segmentation', 6666
97
+ ],
98
+ [
99
+ 'Depth Estimation',
100
+ download_image(
101
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/f1927c4692ba.png?raw=true',
102
+ os.path.join(cache_dir, 'examples/f1927c4692ba.jpg')), None,
103
+ None, '{image} Depth Estimation', 6666
104
+ ],
105
+ [
106
+ 'Pose Estimation',
107
+ download_image(
108
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/014e5bf3b4d1.png?raw=true',
109
+ os.path.join(cache_dir, 'examples/014e5bf3b4d1.jpg')), None,
110
+ None, '{image} distinguish the poses of the figures', 999999
111
+ ],
112
+ [
113
+ 'Scribble Extraction',
114
+ download_image(
115
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/5f59a202f8ac.png?raw=true',
116
+ os.path.join(cache_dir, 'examples/5f59a202f8ac.jpg')), None,
117
+ None, 'Generate a scribble of {image}, please.', 6666
118
+ ],
119
+ [
120
+ 'Mosaic',
121
+ download_image(
122
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/3a2f52361eea.png?raw=true',
123
+ os.path.join(cache_dir, 'examples/3a2f52361eea.jpg')), None,
124
+ None, 'Adapt {image} into a mosaic representation.', 6666
125
+ ],
126
+ [
127
+ 'Edge map Extraction',
128
+ download_image(
129
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/b9d1e519d6e5.png?raw=true',
130
+ os.path.join(cache_dir, 'examples/b9d1e519d6e5.jpg')), None,
131
+ None, 'Get the edge-enhanced result for {image}.', 6666
132
+ ],
133
+ [
134
+ 'Grayscale',
135
+ download_image(
136
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/c4ebbe2ba29b.png?raw=true',
137
+ os.path.join(cache_dir, 'examples/c4ebbe2ba29b.jpg')), None,
138
+ None, 'transform {image} into a black and white one', 6666
139
+ ],
140
+ [
141
+ 'Contour Extraction',
142
+ download_image(
143
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/19652d0f6c4b.png?raw=true',
144
+ os.path.join(cache_dir,
145
+ 'examples/19652d0f6c4b.jpg')), None, None,
146
+ 'Would you be able to make a contour picture from {image} for me?',
147
+ 6666
148
+ ],
149
+ [
150
+ 'Controllable Generation',
151
+ download_image(
152
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/249cda2844b7.png?raw=true',
153
+ os.path.join(cache_dir,
154
+ 'examples/249cda2844b7.jpg')), None, None,
155
+ 'Following the segmentation outcome in mask of {image}, develop a real-life image using the explanatory note in "a mighty cat lying on the bed".',
156
+ 6666
157
+ ],
158
+ [
159
+ 'Controllable Generation',
160
+ download_image(
161
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/411f6c4b8e6c.png?raw=true',
162
+ os.path.join(cache_dir,
163
+ 'examples/411f6c4b8e6c.jpg')), None, None,
164
+ 'use the depth map {image} and the text caption "a cut white cat" to create a corresponding graphic image',
165
+ 999999
166
+ ],
167
+ [
168
+ 'Controllable Generation',
169
+ download_image(
170
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/a35c96ed137a.png?raw=true',
171
+ os.path.join(cache_dir,
172
+ 'examples/a35c96ed137a.jpg')), None, None,
173
+ 'help translate this posture schema {image} into a colored image based on the context I provided "A beautiful woman Climbing the climbing wall, wearing a harness and climbing gear, skillfully maneuvering up the wall with her back to the camera, with a safety rope."',
174
+ 3599999
175
+ ],
176
+ [
177
+ 'Controllable Generation',
178
+ download_image(
179
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/dcb2fc86f1ce.png?raw=true',
180
+ os.path.join(cache_dir,
181
+ 'examples/dcb2fc86f1ce.jpg')), None, None,
182
+ 'Transform and generate an image using mosaic {image} and "Monarch butterflies gracefully perch on vibrant purple flowers, showcasing their striking orange and black wings in a lush garden setting." description',
183
+ 6666
184
+ ],
185
+ [
186
+ 'Controllable Generation',
187
+ download_image(
188
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/4cd4ee494962.png?raw=true',
189
+ os.path.join(cache_dir,
190
+ 'examples/4cd4ee494962.jpg')), None, None,
191
+ 'make this {image} colorful as per the "beautiful sunflowers"',
192
+ 6666
193
+ ],
194
+ [
195
+ 'Controllable Generation',
196
+ download_image(
197
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/a47e3a9cd166.png?raw=true',
198
+ os.path.join(cache_dir,
199
+ 'examples/a47e3a9cd166.jpg')), None, None,
200
+ 'Take the edge conscious {image} and the written guideline "A whimsical animated character is depicted holding a delectable cake adorned with blue and white frosting and a drizzle of chocolate. The character wears a yellow headband with a bow, matching a cozy yellow sweater. Her dark hair is styled in a braid, tied with a yellow ribbon. With a golden fork in hand, she stands ready to enjoy a slice, exuding an air of joyful anticipation. The scene is creatively rendered with a charming and playful aesthetic." and produce a realistic image.',
201
+ 613725
202
+ ],
203
+ [
204
+ 'Controllable Generation',
205
+ download_image(
206
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/d890ed8a3ac2.png?raw=true',
207
+ os.path.join(cache_dir,
208
+ 'examples/d890ed8a3ac2.jpg')), None, None,
209
+ 'creating a vivid image based on {image} and description "This image features a delicious rectangular tart with a flaky, golden-brown crust. The tart is topped with evenly sliced tomatoes, layered over a creamy cheese filling. Aromatic herbs are sprinkled on top, adding a touch of green and enhancing the visual appeal. The background includes a soft, textured fabric and scattered white flowers, creating an elegant and inviting presentation. Bright red tomatoes in the upper right corner hint at the fresh ingredients used in the dish."',
210
+ 6666
211
+ ],
212
+ [
213
+ 'Image Denoising',
214
+ download_image(
215
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/0844a686a179.png?raw=true',
216
+ os.path.join(cache_dir,
217
+ 'examples/0844a686a179.jpg')), None, None,
218
+ 'Eliminate noise interference in {image} and maximize the crispness to obtain superior high-definition quality',
219
+ 6666
220
+ ],
221
+ [
222
+ 'Inpainting',
223
+ download_image(
224
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/fa91b6b7e59b.png?raw=true',
225
+ os.path.join(cache_dir, 'examples/fa91b6b7e59b.jpg')),
226
+ download_image(
227
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/fa91b6b7e59b_mask.png?raw=true',
228
+ os.path.join(cache_dir,
229
+ 'examples/fa91b6b7e59b_mask.jpg')), None,
230
+ 'Ensure to overhaul the parts of the {image} indicated by the mask.',
231
+ 6666
232
+ ],
233
+ [
234
+ 'Inpainting',
235
+ download_image(
236
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/632899695b26.png?raw=true',
237
+ os.path.join(cache_dir, 'examples/632899695b26.jpg')),
238
+ download_image(
239
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/632899695b26_mask.png?raw=true',
240
+ os.path.join(cache_dir,
241
+ 'examples/632899695b26_mask.jpg')), None,
242
+ 'Refashion the mask portion of {image} in accordance with "A yellow egg with a smiling face painted on it"',
243
+ 6666
244
+ ],
245
+ [
246
+ 'General Editing',
247
+ download_image(
248
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/354d17594afe.png?raw=true',
249
+ os.path.join(cache_dir,
250
+ 'examples/354d17594afe.jpg')), None, None,
251
+ '{image} change the dog\'s posture to walking in the water, and change the background to green plants and a pond.',
252
+ 6666
253
+ ],
254
+ [
255
+ 'General Editing',
256
+ download_image(
257
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/38946455752b.png?raw=true',
258
+ os.path.join(cache_dir,
259
+ 'examples/38946455752b.jpg')), None, None,
260
+ '{image} change the color of the dress from white to red and the model\'s hair color red brown to blonde.Other parts remain unchanged',
261
+ 6669
262
+ ],
263
+ [
264
+ 'Facial Editing',
265
+ download_image(
266
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/3ba5202f0cd8.png?raw=true',
267
+ os.path.join(cache_dir,
268
+ 'examples/3ba5202f0cd8.jpg')), None, None,
269
+ 'Keep the same facial feature in @3ba5202f0cd8, change the woman\'s clothing from a Blue denim jacket to a white turtleneck sweater and adjust her posture so that she is supporting her chin with both hands. Other aspects, such as background, hairstyle, facial expression, etc, remain unchanged.',
270
+ 99999
271
+ ],
272
+ [
273
+ 'Facial Editing',
274
+ download_image(
275
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/369365b94725.png?raw=true',
276
+ os.path.join(cache_dir, 'examples/369365b94725.jpg')), None,
277
+ None, '{image} Make her looking at the camera', 6666
278
+ ],
279
+ [
280
+ 'Facial Editing',
281
+ download_image(
282
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/92751f2e4a0e.png?raw=true',
283
+ os.path.join(cache_dir, 'examples/92751f2e4a0e.jpg')), None,
284
+ None, '{image} Remove the smile from his face', 9899999
285
+ ],
286
+ [
287
+ 'Remove Text',
288
+ download_image(
289
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/8530a6711b2e.png?raw=true',
290
+ os.path.join(cache_dir, 'examples/8530a6711b2e.jpg')), None,
291
+ None, 'Aim to remove any textual element in {image}', 6666
292
+ ],
293
+ [
294
+ 'Remove Text',
295
+ download_image(
296
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/c4d7fb28f8f6.png?raw=true',
297
+ os.path.join(cache_dir, 'examples/c4d7fb28f8f6.jpg')),
298
+ download_image(
299
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/c4d7fb28f8f6_mask.png?raw=true',
300
+ os.path.join(cache_dir,
301
+ 'examples/c4d7fb28f8f6_mask.jpg')), None,
302
+ 'Rub out any text found in the mask sector of the {image}.', 6666
303
+ ],
304
+ [
305
+ 'Remove Object',
306
+ download_image(
307
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/e2f318fa5e5b.png?raw=true',
308
+ os.path.join(cache_dir,
309
+ 'examples/e2f318fa5e5b.jpg')), None, None,
310
+ 'Remove the unicorn in this {image}, ensuring a smooth edit.',
311
+ 99999
312
+ ],
313
+ [
314
+ 'Remove Object',
315
+ download_image(
316
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/1ae96d8aca00.png?raw=true',
317
+ os.path.join(cache_dir, 'examples/1ae96d8aca00.jpg')),
318
+ download_image(
319
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/1ae96d8aca00_mask.png?raw=true',
320
+ os.path.join(cache_dir, 'examples/1ae96d8aca00_mask.jpg')),
321
+ None, 'Discard the contents of the mask area from {image}.', 99999
322
+ ],
323
+ [
324
+ 'Add Object',
325
+ download_image(
326
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/80289f48e511.png?raw=true',
327
+ os.path.join(cache_dir, 'examples/80289f48e511.jpg')),
328
+ download_image(
329
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/80289f48e511_mask.png?raw=true',
330
+ os.path.join(cache_dir,
331
+ 'examples/80289f48e511_mask.jpg')), None,
332
+ 'add a Hot Air Balloon into the {image}, per the mask', 613725
333
+ ],
334
+ [
335
+ 'Style Transfer',
336
+ download_image(
337
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/d725cb2009e8.png?raw=true',
338
+ os.path.join(cache_dir, 'examples/d725cb2009e8.jpg')), None,
339
+ None, 'Change the style of {image} to colored pencil style', 99999
340
+ ],
341
+ [
342
+ 'Style Transfer',
343
+ download_image(
344
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/e0f48b3fd010.png?raw=true',
345
+ os.path.join(cache_dir, 'examples/e0f48b3fd010.jpg')), None,
346
+ None, 'make {image} to Walt Disney Animation style', 99999
347
+ ],
348
+ [
349
+ 'Try On',
350
+ download_image(
351
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/ee4ca60b8c96.png?raw=true',
352
+ os.path.join(cache_dir, 'examples/ee4ca60b8c96.jpg')),
353
+ download_image(
354
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/ee4ca60b8c96_mask.png?raw=true',
355
+ os.path.join(cache_dir, 'examples/ee4ca60b8c96_mask.jpg')),
356
+ download_image(
357
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/ebe825bbfe3c.png?raw=true',
358
+ os.path.join(cache_dir, 'examples/ebe825bbfe3c.jpg')),
359
+ 'Change the cloth in {image} to the one in {image1}', 99999
360
+ ],
361
+ [
362
+ 'Workflow',
363
+ download_image(
364
+ 'https://github.com/ali-vilab/ace-page/blob/main/assets/examples/cb85353c004b.png?raw=true',
365
+ os.path.join(cache_dir, 'examples/cb85353c004b.jpg')), None,
366
+ None, '<workflow> ice cream {image}', 99999
367
+ ],
368
+ ]
369
+ print('Finish. Start building UI ...')
370
+ return examples
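
The rows returned by get_examples feed the Gradio demo in the modified app.py, which this diff does not include. The sketch below shows one plausible wiring; the component names and layout are assumptions, but the row order [task, image, mask, reference image, prompt, seed] matches the lists built above.

# Hypothetical wiring; the actual components are defined in app.py (not shown here).
import gradio as gr
from example import get_examples

examples = get_examples('./cache')   # downloads the sample assets into ./cache/examples

with gr.Blocks() as demo:
    task = gr.Textbox(label='Task')
    image = gr.Image(type='filepath', label='Input Image')
    mask = gr.Image(type='filepath', label='Mask')
    ref = gr.Image(type='filepath', label='Reference Image')
    prompt = gr.Textbox(label='Instruction')
    seed = gr.Number(label='Seed', precision=0)
    gr.Examples(examples=examples,
                inputs=[task, image, mask, ref, prompt, seed],
                examples_per_page=6)

demo.launch()
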
model/__init__.py ADDED
@@ -0,0 +1 @@
1
+ #from .flux import Flux, FluxMR, FluxEdit
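
The re-export in model/__init__.py is left commented out. Since the classes in model/flux.py register themselves with scepter's EMBEDDERS and BACKBONES registries at import time, the application presumably only needs to import the module for its side effects. A minimal sketch, assuming the package layout introduced by this commit:

# Importing the module runs the @EMBEDDERS.register_class() / @BACKBONES.register_class()
# decorators, so ACEHFEmbedder, T5ACEPlusClipFluxEmbedder, Flux, etc. become buildable from config.
import model.flux  # noqa: F401  (imported for its registration side effects)
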
model/flux.py ADDED
@@ -0,0 +1,1064 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Alibaba, Inc. and its affiliates.
3
+ import math
4
+ from collections import OrderedDict
5
+ from functools import partial
6
+ import warnings
7
+ from contextlib import nullcontext
8
+ import torch
9
+ from einops import rearrange, repeat
10
+ from scepter.modules.model.base_model import BaseModel
11
+ from scepter.modules.model.registry import BACKBONES
12
+ from scepter.modules.utils.config import dict_to_yaml
13
+ from scepter.modules.utils.distribute import we
14
+ from scepter.modules.utils.file_system import FS
15
+ from torch import Tensor, nn
16
+ from torch.nn.utils.rnn import pad_sequence
17
+ from torch.utils.checkpoint import checkpoint_sequential
18
+ import torch.nn.functional as F
19
+ import torch.utils.dlpack
20
+ import transformers
21
+ from scepter.modules.model.embedder.base_embedder import BaseEmbedder
22
+ from scepter.modules.model.registry import EMBEDDERS
23
+ from scepter.modules.model.tokenizer.tokenizer_component import (
24
+ basic_clean, canonicalize, heavy_clean, whitespace_clean)
25
+ try:
26
+ from transformers import AutoTokenizer, T5EncoderModel
27
+ except Exception as e:
28
+ warnings.warn(
29
+ f'Import transformers error, please deal with this problem: {e}')
30
+
31
+ from .layers import (DoubleStreamBlock, EmbedND, LastLayer,
32
+ MLPEmbedder, SingleStreamBlock,
33
+ timestep_embedding)
34
+
35
+
36
+
37
+ @EMBEDDERS.register_class()
38
+ class ACETextEmbedder(BaseEmbedder):
39
+ """
40
+ Uses the OpenCLIP transformer encoder for text
41
+ """
42
+ """
43
+ Uses the OpenCLIP transformer encoder for text
44
+ """
45
+ para_dict = {
46
+ 'PRETRAINED_MODEL': {
47
+ 'value':
48
+ 'google/umt5-small',
49
+ 'description':
50
+ 'Pretrained Model for umt5, modelcard path or local path.'
51
+ },
52
+ 'TOKENIZER_PATH': {
53
+ 'value': 'google/umt5-small',
54
+ 'description':
55
+ 'Tokenizer Path for umt5, modelcard path or local path.'
56
+ },
57
+ 'FREEZE': {
58
+ 'value': True,
59
+ 'description': ''
60
+ },
61
+ 'USE_GRAD': {
62
+ 'value': False,
63
+ 'description': 'Compute grad or not.'
64
+ },
65
+ 'CLEAN': {
66
+ 'value':
67
+ 'whitespace',
68
+ 'description':
69
+ 'Set the clean strategy for the tokenizer, used when TOKENIZER_PATH is not None.'
70
+ },
71
+ 'LAYER': {
72
+ 'value': 'last',
73
+ 'description': ''
74
+ },
75
+ 'LEGACY': {
76
+ 'value':
77
+ True,
78
+ 'description':
79
+ 'Whether to use the legacy returned feature or not, default True.'
80
+ }
81
+ }
82
+
83
+ def __init__(self, cfg, logger=None):
84
+ super().__init__(cfg, logger=logger)
85
+ pretrained_path = cfg.get('PRETRAINED_MODEL', None)
86
+ self.t5_dtype = cfg.get('T5_DTYPE', 'float32')
87
+ assert pretrained_path
88
+ with FS.get_dir_to_local_dir(pretrained_path,
89
+ wait_finish=True) as local_path:
90
+ self.model = T5EncoderModel.from_pretrained(
91
+ local_path,
92
+ torch_dtype=getattr(
93
+ torch,
94
+ 'float' if self.t5_dtype == 'float32' else self.t5_dtype))
95
+ tokenizer_path = cfg.get('TOKENIZER_PATH', None)
96
+ self.length = cfg.get('LENGTH', 77)
97
+
98
+ self.use_grad = cfg.get('USE_GRAD', False)
99
+ self.clean = cfg.get('CLEAN', 'whitespace')
100
+ self.added_identifier = cfg.get('ADDED_IDENTIFIER', None)
101
+ if tokenizer_path:
102
+ self.tokenize_kargs = {'return_tensors': 'pt'}
103
+ with FS.get_dir_to_local_dir(tokenizer_path,
104
+ wait_finish=True) as local_path:
105
+ if self.added_identifier is not None and isinstance(
106
+ self.added_identifier, list):
107
+ self.tokenizer = AutoTokenizer.from_pretrained(local_path)
108
+ else:
109
+ self.tokenizer = AutoTokenizer.from_pretrained(local_path)
110
+ if self.length is not None:
111
+ self.tokenize_kargs.update({
112
+ 'padding': 'max_length',
113
+ 'truncation': True,
114
+ 'max_length': self.length
115
+ })
116
+ self.eos_token = self.tokenizer(
117
+ self.tokenizer.eos_token)['input_ids'][0]
118
+ else:
119
+ self.tokenizer = None
120
+
124
+
125
+ def freeze(self):
126
+ self.model = self.model.eval()
127
+ for param in self.parameters():
128
+ param.requires_grad = False
129
+
130
+ # encode && encode_text
131
+ def forward(self, tokens, return_mask=False, use_mask=True):
132
+ # tokenization
133
+ embedding_context = nullcontext if self.use_grad else torch.no_grad
134
+ with embedding_context():
135
+ if use_mask:
136
+ x = self.model(tokens.input_ids.to(we.device_id),
137
+ tokens.attention_mask.to(we.device_id))
138
+ else:
139
+ x = self.model(tokens.input_ids.to(we.device_id))
140
+ x = x.last_hidden_state
141
+
142
+ if return_mask:
143
+ return x.detach() + 0.0, tokens.attention_mask.to(we.device_id)
144
+ else:
145
+ return x.detach() + 0.0, None
146
+
147
+ def _clean(self, text):
148
+ if self.clean == 'whitespace':
149
+ text = whitespace_clean(basic_clean(text))
150
+ elif self.clean == 'lower':
151
+ text = whitespace_clean(basic_clean(text)).lower()
152
+ elif self.clean == 'canonicalize':
153
+ text = canonicalize(basic_clean(text))
154
+ elif self.clean == 'heavy':
155
+ text = heavy_clean(basic_clean(text))
156
+ return text
157
+
158
+ def encode(self, text, return_mask=False, use_mask=True):
159
+ if isinstance(text, str):
160
+ text = [text]
161
+ if self.clean:
162
+ text = [self._clean(u) for u in text]
163
+ assert self.tokenizer is not None
164
+ cont, mask = [], []
165
+ with torch.autocast(device_type='cuda',
166
+ enabled=self.t5_dtype in ('float16', 'bfloat16'),
167
+ dtype=getattr(torch, self.t5_dtype)):
168
+ for tt in text:
169
+ tokens = self.tokenizer([tt], **self.tokenize_kargs)
170
+ one_cont, one_mask = self(tokens,
171
+ return_mask=return_mask,
172
+ use_mask=use_mask)
173
+ cont.append(one_cont)
174
+ mask.append(one_mask)
175
+ if return_mask:
176
+ return torch.cat(cont, dim=0), torch.cat(mask, dim=0)
177
+ else:
178
+ return torch.cat(cont, dim=0)
179
+
180
+ def encode_list(self, text_list, return_mask=True):
181
+ cont_list = []
182
+ mask_list = []
183
+ for pp in text_list:
184
+ cont, cont_mask = self.encode(pp, return_mask=return_mask)
185
+ cont_list.append(cont)
186
+ mask_list.append(cont_mask)
187
+ if return_mask:
188
+ return cont_list, mask_list
189
+ else:
190
+ return cont_list
191
+
192
+ @staticmethod
193
+ def get_config_template():
194
+ return dict_to_yaml('MODELS',
195
+ __class__.__name__,
196
+ ACETextEmbedder.para_dict,
197
+ set_name=True)
198
+
199
+ @EMBEDDERS.register_class()
200
+ class ACEHFEmbedder(BaseEmbedder):
201
+ para_dict = {
202
+ "HF_MODEL_CLS": {
203
+ "value": None,
204
+ "description": "huggingface cls in transfomer"
205
+ },
206
+ "MODEL_PATH": {
207
+ "value": None,
208
+ "description": "model folder path"
209
+ },
210
+ "HF_TOKENIZER_CLS": {
211
+ "value": None,
212
+ "description": "huggingface cls in transfomer"
213
+ },
214
+
215
+ "TOKENIZER_PATH": {
216
+ "value": None,
217
+ "description": "tokenizer folder path"
218
+ },
219
+ "MAX_LENGTH": {
220
+ "value": 77,
221
+ "description": "max length of input"
222
+ },
223
+ "OUTPUT_KEY": {
224
+ "value": "last_hidden_state",
225
+ "description": "output key"
226
+ },
227
+ "D_TYPE": {
228
+ "value": "float",
229
+ "description": "dtype"
230
+ },
231
+ "BATCH_INFER": {
232
+ "value": False,
233
+ "description": "batch infer"
234
+ }
235
+ }
236
+ para_dict.update(BaseEmbedder.para_dict)
237
+ def __init__(self, cfg, logger=None):
238
+ super().__init__(cfg, logger=logger)
239
+ hf_model_cls = cfg.get('HF_MODEL_CLS', None)
240
+ model_path = cfg.get("MODEL_PATH", None)
241
+ hf_tokenizer_cls = cfg.get('HF_TOKENIZER_CLS', None)
242
+ tokenizer_path = cfg.get('TOKENIZER_PATH', None)
243
+ self.max_length = cfg.get('MAX_LENGTH', 77)
244
+ self.output_key = cfg.get("OUTPUT_KEY", "last_hidden_state")
245
+ self.d_type = cfg.get("D_TYPE", "float")
246
+ self.clean = cfg.get("CLEAN", "whitespace")
247
+ self.batch_infer = cfg.get("BATCH_INFER", False)
248
+ self.added_identifier = cfg.get('ADDED_IDENTIFIER', None)
249
+ torch_dtype = getattr(torch, self.d_type)
250
+
251
+ assert hf_model_cls is not None and hf_tokenizer_cls is not None
252
+ assert model_path is not None and tokenizer_path is not None
253
+ with FS.get_dir_to_local_dir(tokenizer_path, wait_finish=True) as local_path:
254
+ self.tokenizer = getattr(transformers, hf_tokenizer_cls).from_pretrained(local_path,
255
+ max_length = self.max_length,
256
+ torch_dtype = torch_dtype,
257
+ additional_special_tokens=self.added_identifier)
258
+
259
+ with FS.get_dir_to_local_dir(model_path, wait_finish=True) as local_path:
260
+ self.hf_module = getattr(transformers, hf_model_cls).from_pretrained(local_path, torch_dtype = torch_dtype)
261
+
262
+
263
+ self.hf_module = self.hf_module.eval().requires_grad_(False)
264
+
265
+ def forward(self, text: list[str], return_mask = False):
266
+ batch_encoding = self.tokenizer(
267
+ text,
268
+ truncation=True,
269
+ max_length=self.max_length,
270
+ return_length=False,
271
+ return_overflowing_tokens=False,
272
+ padding="max_length",
273
+ return_tensors="pt",
274
+ )
275
+
276
+ outputs = self.hf_module(
277
+ input_ids=batch_encoding["input_ids"].to(self.hf_module.device),
278
+ attention_mask=None,
279
+ output_hidden_states=False,
280
+ )
281
+ if return_mask:
282
+ return outputs[self.output_key], batch_encoding['attention_mask'].to(self.hf_module.device)
283
+ else:
284
+ return outputs[self.output_key], None
285
+
286
+ def encode(self, text, return_mask = False):
287
+ if isinstance(text, str):
288
+ text = [text]
289
+ if self.clean:
290
+ text = [self._clean(u) for u in text]
291
+ if not self.batch_infer:
292
+ cont, mask = [], []
293
+ for tt in text:
294
+ one_cont, one_mask = self([tt], return_mask=return_mask)
295
+ cont.append(one_cont)
296
+ mask.append(one_mask)
297
+ if return_mask:
298
+ return torch.cat(cont, dim=0), torch.cat(mask, dim=0)
299
+ else:
300
+ return torch.cat(cont, dim=0)
301
+ else:
302
+ ret_data = self(text, return_mask = return_mask)
303
+ if return_mask:
304
+ return ret_data
305
+ else:
306
+ return ret_data[0]
307
+
308
+ def encode_list(self, text_list, return_mask=True):
309
+ cont_list = []
310
+ mask_list = []
311
+ for pp in text_list:
312
+ cont = self.encode(pp, return_mask=return_mask)
313
+ cont_list.append(cont[0]) if return_mask else cont_list.append(cont)
314
+ mask_list.append(cont[1]) if return_mask else mask_list.append(None)
315
+ if return_mask:
316
+ return cont_list, mask_list
317
+ else:
318
+ return cont_list
319
+
320
+ def encode_list_of_list(self, text_list, return_mask=True):
321
+ cont_list = []
322
+ mask_list = []
323
+ for pp in text_list:
324
+ cont = self.encode_list(pp, return_mask=return_mask)
325
+ cont_list.append(cont[0]) if return_mask else cont_list.append(cont)
326
+ mask_list.append(cont[1]) if return_mask else mask_list.append(None)
327
+ if return_mask:
328
+ return cont_list, mask_list
329
+ else:
330
+ return cont_list
331
+
332
+ def _clean(self, text):
333
+ if self.clean == 'whitespace':
334
+ text = whitespace_clean(basic_clean(text))
335
+ elif self.clean == 'lower':
336
+ text = whitespace_clean(basic_clean(text)).lower()
337
+ elif self.clean == 'canonicalize':
338
+ text = canonicalize(basic_clean(text))
339
+ return text
340
+ @staticmethod
341
+ def get_config_template():
342
+ return dict_to_yaml('EMBEDDER',
343
+ __class__.__name__,
344
+ ACEHFEmbedder.para_dict,
345
+ set_name=True)
346
+
347
+ @EMBEDDERS.register_class()
348
+ class T5ACEPlusClipFluxEmbedder(BaseEmbedder):
349
+ """
350
+ Combines the T5 and CLIP text encoders for Flux conditioning.
351
+ """
352
+ para_dict = {
353
+ 'T5_MODEL': {},
354
+ 'CLIP_MODEL': {}
355
+ }
356
+
357
+ def __init__(self, cfg, logger=None):
358
+ super().__init__(cfg, logger=logger)
359
+ self.t5_model = EMBEDDERS.build(cfg.T5_MODEL, logger=logger)
360
+ self.clip_model = EMBEDDERS.build(cfg.CLIP_MODEL, logger=logger)
361
+
362
+ def encode(self, text, return_mask = False):
363
+ t5_embeds = self.t5_model.encode(text, return_mask = return_mask)
364
+ clip_embeds = self.clip_model.encode(text, return_mask = return_mask)
365
+ # change embedding strategy here
366
+ return {
367
+ 'context': t5_embeds,
368
+ 'y': clip_embeds,
369
+ }
370
+
371
+ def encode_list(self, text, return_mask = False):
372
+ t5_embeds = self.t5_model.encode_list(text, return_mask = return_mask)
373
+ clip_embeds = self.clip_model.encode_list(text, return_mask = return_mask)
374
+ # change embedding strategy here
375
+ return {
376
+ 'context': t5_embeds,
377
+ 'y': clip_embeds,
378
+ }
379
+
380
+ def encode_list_of_list(self, text, return_mask = False):
381
+ t5_embeds = self.t5_model.encode_list_of_list(text, return_mask = return_mask)
382
+ clip_embeds = self.clip_model.encode_list_of_list(text, return_mask = return_mask)
383
+ # change embedding strategy here
384
+ return {
385
+ 'context': t5_embeds,
386
+ 'y': clip_embeds,
387
+ }
388
+
389
+
390
+ @staticmethod
391
+ def get_config_template():
392
+ return dict_to_yaml('EMBEDDER',
393
+ __class__.__name__,
394
+ T5ACEPlusClipFluxEmbedder.para_dict,
395
+ set_name=True)
396
+
397
+ @BACKBONES.register_class()
398
+ class Flux(BaseModel):
399
+ """
400
+ Transformer backbone Diffusion model with RoPE.
401
+ """
402
+ para_dict = {
403
+ "IN_CHANNELS": {
404
+ "value": 64,
405
+ "description": "model's input channels."
406
+ },
407
+ "OUT_CHANNELS": {
408
+ "value": 64,
409
+ "description": "model's output channels."
410
+ },
411
+ "HIDDEN_SIZE": {
412
+ "value": 1024,
413
+ "description": "model's hidden size."
414
+ },
415
+ "NUM_HEADS": {
416
+ "value": 16,
417
+ "description": "number of heads in the transformer."
418
+ },
419
+ "AXES_DIM": {
420
+ "value": [16, 56, 56],
421
+ "description": "dimensions of the axes of the positional encoding."
422
+ },
423
+ "THETA": {
424
+ "value": 10_000,
425
+ "description": "theta for positional encoding."
426
+ },
427
+ "VEC_IN_DIM": {
428
+ "value": 768,
429
+ "description": "dimension of the vector input."
430
+ },
431
+ "GUIDANCE_EMBED": {
432
+ "value": False,
433
+ "description": "whether to use guidance embedding."
434
+ },
435
+ "CONTEXT_IN_DIM": {
436
+ "value": 4096,
437
+ "description": "dimension of the context input."
438
+ },
439
+ "MLP_RATIO": {
440
+ "value": 4.0,
441
+ "description": "ratio of mlp hidden size to hidden size."
442
+ },
443
+ "QKV_BIAS": {
444
+ "value": True,
445
+ "description": "whether to use bias in qkv projection."
446
+ },
447
+ "DEPTH": {
448
+ "value": 19,
449
+ "description": "number of transformer blocks."
450
+ },
451
+ "DEPTH_SINGLE_BLOCKS": {
452
+ "value": 38,
453
+ "description": "number of transformer blocks in the single stream block."
454
+ },
455
+ "USE_GRAD_CHECKPOINT": {
456
+ "value": False,
457
+ "description": "whether to use gradient checkpointing."
458
+ },
459
+ "ATTN_BACKEND": {
460
+ "value": "pytorch",
461
+ "description": "backend for the transformer blocks, 'pytorch' or 'flash_attn'."
462
+ }
463
+ }
464
+ def __init__(
465
+ self,
466
+ cfg,
467
+ logger = None
468
+ ):
469
+ super().__init__(cfg, logger=logger)
470
+ self.in_channels = cfg.IN_CHANNELS
471
+ self.out_channels = cfg.get("OUT_CHANNELS", self.in_channels)
472
+ hidden_size = cfg.get("HIDDEN_SIZE", 1024)
473
+ num_heads = cfg.get("NUM_HEADS", 16)
474
+ axes_dim = cfg.AXES_DIM
475
+ theta = cfg.THETA
476
+ vec_in_dim = cfg.VEC_IN_DIM
477
+ self.guidance_embed = cfg.GUIDANCE_EMBED
478
+ context_in_dim = cfg.CONTEXT_IN_DIM
479
+ mlp_ratio = cfg.MLP_RATIO
480
+ qkv_bias = cfg.QKV_BIAS
481
+ depth = cfg.DEPTH
482
+ depth_single_blocks = cfg.DEPTH_SINGLE_BLOCKS
483
+ self.use_grad_checkpoint = cfg.get("USE_GRAD_CHECKPOINT", False)
484
+ self.attn_backend = cfg.get("ATTN_BACKEND", "pytorch")
485
+ self.lora_model = cfg.get("DIFFUSERS_LORA_MODEL", None)
486
+ self.swift_lora_model = cfg.get("SWIFT_LORA_MODEL", None)
487
+ self.pretrain_adapter = cfg.get("PRETRAIN_ADAPTER", None)
488
+
489
+ if hidden_size % num_heads != 0:
490
+ raise ValueError(
491
+ f"Hidden size {hidden_size} must be divisible by num_heads {num_heads}"
492
+ )
493
+ pe_dim = hidden_size // num_heads
494
+ if sum(axes_dim) != pe_dim:
495
+ raise ValueError(f"Got {axes_dim} but expected positional dim {pe_dim}")
496
+ self.hidden_size = hidden_size
497
+ self.num_heads = num_heads
498
+ self.pe_embedder = EmbedND(dim=pe_dim, theta=theta, axes_dim= axes_dim)
499
+ self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
500
+ self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
501
+ self.vector_in = MLPEmbedder(vec_in_dim, self.hidden_size)
502
+ self.guidance_in = (
503
+ MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) if self.guidance_embed else nn.Identity()
504
+ )
505
+ self.txt_in = nn.Linear(context_in_dim, self.hidden_size)
506
+
507
+ self.double_blocks = nn.ModuleList(
508
+ [
509
+ DoubleStreamBlock(
510
+ self.hidden_size,
511
+ self.num_heads,
512
+ mlp_ratio=mlp_ratio,
513
+ qkv_bias=qkv_bias,
514
+ backend=self.attn_backend
515
+ )
516
+ for _ in range(depth)
517
+ ]
518
+ )
519
+
520
+ self.single_blocks = nn.ModuleList(
521
+ [
522
+ SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=mlp_ratio, backend=self.attn_backend)
523
+ for _ in range(depth_single_blocks)
524
+ ]
525
+ )
526
+
527
+ self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)
528
+
529
+ def prepare_input(self, x, context, y, x_shape=None):
530
+ # x.shape [6, 16, 16, 16] target is [6, 16, 768, 1360]
531
+ bs, c, h, w = x.shape
532
+ x = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
533
+ x_id = torch.zeros(h // 2, w // 2, 3)
534
+ x_id[..., 1] = x_id[..., 1] + torch.arange(h // 2)[:, None]
535
+ x_id[..., 2] = x_id[..., 2] + torch.arange(w // 2)[None, :]
536
+ x_ids = repeat(x_id, "h w c -> b (h w) c", b=bs)
537
+ txt_ids = torch.zeros(bs, context.shape[1], 3)
538
+ return x, x_ids.to(x), context.to(x), txt_ids.to(x), y.to(x), h, w
539
+
540
+ def unpack(self, x: Tensor, height: int, width: int) -> Tensor:
541
+ return rearrange(
542
+ x,
543
+ "b (h w) (c ph pw) -> b c (h ph) (w pw)",
544
+ h=math.ceil(height/2),
545
+ w=math.ceil(width/2),
546
+ ph=2,
547
+ pw=2,
548
+ )
549
+
550
+ def merge_diffuser_lora(self, ori_sd, lora_sd, scale = 1.0):
551
+ key_map = {
552
+ "single_blocks.{}.linear1.weight": {"key_list": [
553
+ ["transformer.single_transformer_blocks.{}.attn.to_q.lora_A.weight",
554
+ "transformer.single_transformer_blocks.{}.attn.to_q.lora_B.weight"],
555
+ ["transformer.single_transformer_blocks.{}.attn.to_k.lora_A.weight",
556
+ "transformer.single_transformer_blocks.{}.attn.to_k.lora_B.weight"],
557
+ ["transformer.single_transformer_blocks.{}.attn.to_v.lora_A.weight",
558
+ "transformer.single_transformer_blocks.{}.attn.to_v.lora_B.weight"],
559
+ ["transformer.single_transformer_blocks.{}.proj_mlp.lora_A.weight",
560
+ "transformer.single_transformer_blocks.{}.proj_mlp.lora_B.weight"]
561
+ ], "num": 38},
562
+ "single_blocks.{}.modulation.lin.weight": {"key_list": [
563
+ ["transformer.single_transformer_blocks.{}.norm.linear.lora_A.weight",
564
+ "transformer.single_transformer_blocks.{}.norm.linear.lora_B.weight"],
565
+ ], "num": 38},
566
+ "single_blocks.{}.linear2.weight": {"key_list": [
567
+ ["transformer.single_transformer_blocks.{}.proj_out.lora_A.weight",
568
+ "transformer.single_transformer_blocks.{}.proj_out.lora_B.weight"],
569
+ ], "num": 38},
570
+ "double_blocks.{}.txt_attn.qkv.weight": {"key_list": [
571
+ ["transformer.transformer_blocks.{}.attn.add_q_proj.lora_A.weight",
572
+ "transformer.transformer_blocks.{}.attn.add_q_proj.lora_B.weight"],
573
+ ["transformer.transformer_blocks.{}.attn.add_k_proj.lora_A.weight",
574
+ "transformer.transformer_blocks.{}.attn.add_k_proj.lora_B.weight"],
575
+ ["transformer.transformer_blocks.{}.attn.add_v_proj.lora_A.weight",
576
+ "transformer.transformer_blocks.{}.attn.add_v_proj.lora_B.weight"],
577
+ ], "num": 19},
578
+ "double_blocks.{}.img_attn.qkv.weight": {"key_list": [
579
+ ["transformer.transformer_blocks.{}.attn.to_q.lora_A.weight",
580
+ "transformer.transformer_blocks.{}.attn.to_q.lora_B.weight"],
581
+ ["transformer.transformer_blocks.{}.attn.to_k.lora_A.weight",
582
+ "transformer.transformer_blocks.{}.attn.to_k.lora_B.weight"],
583
+ ["transformer.transformer_blocks.{}.attn.to_v.lora_A.weight",
584
+ "transformer.transformer_blocks.{}.attn.to_v.lora_B.weight"],
585
+ ], "num": 19},
586
+ "double_blocks.{}.img_attn.proj.weight": {"key_list": [
587
+ ["transformer.transformer_blocks.{}.attn.to_out.0.lora_A.weight",
588
+ "transformer.transformer_blocks.{}.attn.to_out.0.lora_B.weight"]
589
+ ], "num": 19},
590
+ "double_blocks.{}.txt_attn.proj.weight": {"key_list": [
591
+ ["transformer.transformer_blocks.{}.attn.to_add_out.lora_A.weight",
592
+ "transformer.transformer_blocks.{}.attn.to_add_out.lora_B.weight"]
593
+ ], "num": 19},
594
+ "double_blocks.{}.img_mlp.0.weight": {"key_list": [
595
+ ["transformer.transformer_blocks.{}.ff.net.0.proj.lora_A.weight",
596
+ "transformer.transformer_blocks.{}.ff.net.0.proj.lora_B.weight"]
597
+ ], "num": 19},
598
+ "double_blocks.{}.img_mlp.2.weight": {"key_list": [
599
+ ["transformer.transformer_blocks.{}.ff.net.2.lora_A.weight",
600
+ "transformer.transformer_blocks.{}.ff.net.2.lora_B.weight"]
601
+ ], "num": 19},
602
+ "double_blocks.{}.txt_mlp.0.weight": {"key_list": [
603
+ ["transformer.transformer_blocks.{}.ff_context.net.0.proj.lora_A.weight",
604
+ "transformer.transformer_blocks.{}.ff_context.net.0.proj.lora_B.weight"]
605
+ ], "num": 19},
606
+ "double_blocks.{}.txt_mlp.2.weight": {"key_list": [
607
+ ["transformer.transformer_blocks.{}.ff_context.net.2.lora_A.weight",
608
+ "transformer.transformer_blocks.{}.ff_context.net.2.lora_B.weight"]
609
+ ], "num": 19},
610
+ "double_blocks.{}.img_mod.lin.weight": {"key_list": [
611
+ ["transformer.transformer_blocks.{}.norm1.linear.lora_A.weight",
612
+ "transformer.transformer_blocks.{}.norm1.linear.lora_B.weight"]
613
+ ], "num": 19},
614
+ "double_blocks.{}.txt_mod.lin.weight": {"key_list": [
615
+ ["transformer.transformer_blocks.{}.norm1_context.linear.lora_A.weight",
616
+ "transformer.transformer_blocks.{}.norm1_context.linear.lora_B.weight"]
617
+ ], "num": 19}
618
+ }
619
+ for k, v in key_map.items():
620
+ key_list = v["key_list"]
621
+ block_num = v["num"]
622
+ for block_id in range(block_num):
623
+ current_weight_list = []
624
+ for k_list in key_list:
625
+ current_weight = torch.matmul(lora_sd[k_list[0].format(block_id)].permute(1, 0),
626
+ lora_sd[k_list[1].format(block_id)].permute(1, 0)).permute(1, 0)
627
+ current_weight_list.append(current_weight)
628
+ current_weight = torch.cat(current_weight_list, dim=0)
629
+ ori_sd[k.format(block_id)] += scale*current_weight
630
+ return ori_sd
631
+
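For context on the mapping above: each LoRA pair contributes a delta weight equal to `B @ A` (the `permute`/`matmul`/`permute` sequence is exactly that), and the per-projection deltas are concatenated along the output dimension to match FLUX's fused `qkv` layout. A minimal sketch, not part of this commit; `dim`, `rank`, `scale` and the random tensors are illustrative:

import torch

dim, rank, scale = 64, 4, 1.0
base_qkv = torch.zeros(3 * dim, dim)            # fused [q; k; v] weight, laid out (out, in)
deltas = []
for _ in range(3):                              # one LoRA (A, B) pair per projection
    lora_A = torch.randn(rank, dim)             # (rank, in)
    lora_B = torch.randn(dim, rank)             # (out, rank)
    # matmul(A^T, B^T)^T is the usual LoRA update B @ A with shape (out, in)
    delta = torch.matmul(lora_A.permute(1, 0), lora_B.permute(1, 0)).permute(1, 0)
    assert torch.allclose(delta, lora_B @ lora_A, atol=1e-5)
    deltas.append(delta)
base_qkv += scale * torch.cat(deltas, dim=0)    # same concatenation as the key_list loop above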
632
+ def merge_swift_lora(self, ori_sd, lora_sd, scale = 1.0):
633
+ have_lora_keys = {}
634
+ for k, v in lora_sd.items():
635
+ k = k[len("model."):] if k.startswith("model.") else k
636
+ ori_key = k.split("lora")[0] + "weight"
637
+ if ori_key not in ori_sd:
638
+ raise f"{ori_key} should in the original statedict"
639
+ if ori_key not in have_lora_keys:
640
+ have_lora_keys[ori_key] = {}
641
+ if "lora_A" in k:
642
+ have_lora_keys[ori_key]["lora_A"] = v
643
+ elif "lora_B" in k:
644
+ have_lora_keys[ori_key]["lora_B"] = v
645
+ else:
646
+ raise NotImplementedError
647
+ for key, v in have_lora_keys.items():
648
+ current_weight = torch.matmul(v["lora_A"].permute(1, 0), v["lora_B"].permute(1, 0)).permute(1, 0)
649
+ ori_sd[key] += scale * current_weight
650
+ return ori_sd
651
+
652
+
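merge_swift_lora above relies on the SWIFT naming convention: the base parameter name is everything before the literal substring "lora", with "weight" appended. A small sketch of that key mapping, using a hypothetical key:

lora_key = "model.double_blocks.0.img_attn.qkv.lora_A.weight"   # hypothetical SWIFT key
k = lora_key[len("model."):] if lora_key.startswith("model.") else lora_key
ori_key = k.split("lora")[0] + "weight"
print(ori_key)   # -> double_blocks.0.img_attn.qkv.weight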
653
+ def load_pretrained_model(self, pretrained_model):
654
+ if next(self.parameters()).device.type == 'meta':
655
+ map_location = we.device_id
656
+ else:
657
+ map_location = "cpu"
658
+ if self.lora_model is not None:
659
+ map_location = we.device_id
660
+ if pretrained_model is not None:
661
+ with FS.get_from(pretrained_model, wait_finish=True) as local_model:
662
+ if local_model.endswith('safetensors'):
663
+ from safetensors.torch import load_file as load_safetensors
664
+ sd = load_safetensors(local_model, device=map_location)
665
+ else:
666
+ sd = torch.load(local_model, map_location=map_location)
667
+ if "state_dict" in sd:
668
+ sd = sd["state_dict"]
669
+ if "model" in sd:
670
+ sd = sd["model"]["model"]
671
+
672
+ if self.lora_model is not None:
673
+ with FS.get_from(self.lora_model, wait_finish=True) as local_model:
674
+ if local_model.endswith('safetensors'):
675
+ from safetensors.torch import load_file as load_safetensors
676
+ lora_sd = load_safetensors(local_model, device=map_location)
677
+ else:
678
+ lora_sd = torch.load(local_model, map_location=map_location)
679
+ sd = self.merge_diffuser_lora(sd, lora_sd)
680
+ if self.swift_lora_model is not None:
681
+ with FS.get_from(self.swift_lora_model, wait_finish=True) as local_model:
682
+ if local_model.endswith('safetensors'):
683
+ from safetensors.torch import load_file as load_safetensors
684
+ lora_sd = load_safetensors(local_model, device=map_location)
685
+ else:
686
+ lora_sd = torch.load(local_model, map_location=map_location)
687
+ sd = self.merge_swift_lora(sd, lora_sd)
688
+
689
+ adapter_ckpt = {}
690
+ if self.pretrain_adapter is not None:
691
+ with FS.get_from(self.pretrain_adapter, wait_finish=True) as local_adapter:
692
+ if local_adapter.endswith('safetensors'):
693
+ from safetensors.torch import load_file as load_safetensors
694
+ adapter_ckpt = load_safetensors(local_adapter, device=map_location)
695
+ else:
696
+ adapter_ckpt = torch.load(local_adapter, map_location=map_location)
697
+ sd.update(adapter_ckpt)
698
+
699
+
700
+ new_ckpt = OrderedDict()
701
+ for k, v in sd.items():
702
+ if k in ("img_in.weight"):
703
+ model_p = self.state_dict()[k]
704
+ if v.shape != model_p.shape:
705
+ model_p.zero_()
706
+ model_p[:, :64].copy_(v[:, :64])
707
+ new_ckpt[k] = torch.nn.parameter.Parameter(model_p)
708
+ else:
709
+ new_ckpt[k] = v
710
+ else:
711
+ new_ckpt[k] = v
712
+
713
+
714
+ missing, unexpected = self.load_state_dict(new_ckpt, strict=False, assign=True)
715
+ self.logger.info(
716
+ f'Restored from {pretrained_model} with {len(missing)} missing and {len(unexpected)} unexpected keys'
717
+ )
718
+ if len(missing) > 0:
719
+ self.logger.info(f'Missing Keys:\n {missing}')
720
+ if len(unexpected) > 0:
721
+ self.logger.info(f'\nUnexpected Keys:\n {unexpected}')
722
+
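The `img_in.weight` special case in `load_pretrained_model` covers checkpoints whose input projection has fewer input channels than the current model (the edit variants concatenate extra latent channels): the pretrained first 64 columns are copied and the new columns stay zero. A minimal sketch; the 3072/384 shapes are illustrative, not taken from the commit:

import torch

ckpt_w = torch.randn(3072, 64)          # pretrained img_in.weight: (hidden, base latent channels)
model_w = torch.zeros(3072, 384)        # current model expects more input channels
model_w[:, :64].copy_(ckpt_w[:, :64])   # reuse pretrained columns, leave the rest at zero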
723
+ def forward(
724
+ self,
725
+ x: Tensor,
726
+ t: Tensor,
727
+ cond: dict = {},
728
+ guidance: Tensor | None = None,
729
+ gc_seg: int = 0
730
+ ) -> Tensor:
731
+ x, x_ids, txt, txt_ids, y, h, w = self.prepare_input(x, cond["context"], cond["y"])
732
+ # running on sequences img
733
+ x = self.img_in(x)
734
+ vec = self.time_in(timestep_embedding(t, 256))
735
+ if self.guidance_embed:
736
+ if guidance is None:
737
+ raise ValueError("Didn't get guidance strength for guidance distilled model.")
738
+ vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
739
+ vec = vec + self.vector_in(y)
740
+ txt = self.txt_in(txt)
741
+ ids = torch.cat((txt_ids, x_ids), dim=1)
742
+ pe = self.pe_embedder(ids)
743
+ kwargs = dict(
744
+ vec=vec,
745
+ pe=pe,
746
+ txt_length=txt.shape[1],
747
+ )
748
+ x = torch.cat((txt, x), 1)
749
+ if self.use_grad_checkpoint and gc_seg >= 0:
750
+ x = checkpoint_sequential(
751
+ functions=[partial(block, **kwargs) for block in self.double_blocks],
752
+ segments=gc_seg if gc_seg > 0 else len(self.double_blocks),
753
+ input=x,
754
+ use_reentrant=False
755
+ )
756
+ else:
757
+ for block in self.double_blocks:
758
+ x = block(x, **kwargs)
759
+
760
+ kwargs = dict(
761
+ vec=vec,
762
+ pe=pe,
763
+ )
764
+
765
+ if self.use_grad_checkpoint and gc_seg >= 0:
766
+ x = checkpoint_sequential(
767
+ functions=[partial(block, **kwargs) for block in self.single_blocks],
768
+ segments=gc_seg if gc_seg > 0 else len(self.single_blocks),
769
+ input=x,
770
+ use_reentrant=False
771
+ )
772
+ else:
773
+ for block in self.single_blocks:
774
+ x = block(x, **kwargs)
775
+ x = x[:, txt.shape[1] :, ...]
776
+ x = self.final_layer(x, vec) # (N, T, patch_size ** 2 * out_channels) 6 64 64
777
+ x = self.unpack(x, h, w)
778
+ return x
779
+
780
+ @staticmethod
781
+ def get_config_template():
782
+ return dict_to_yaml('MODEL',
783
+ __class__.__name__,
784
+ Flux.para_dict,
785
+ set_name=True)
786
+
787
+ @BACKBONES.register_class()
788
+ class FluxMR(Flux):
789
+ def prepare_input(self, x, cond):
790
+ if isinstance(cond['context'], list):
791
+ context, y = torch.cat(cond["context"], dim=0).to(x), torch.cat(cond["y"], dim=0).to(x)
792
+ else:
793
+ context, y = cond['context'].to(x), cond['y'].to(x)
794
+ batch_frames, batch_frames_ids = [], []
795
+ for ix, shape in zip(x, cond["x_shapes"]):
796
+ # unpack image from sequence
797
+ ix = ix[:, :shape[0] * shape[1]].view(-1, shape[0], shape[1])
798
+ c, h, w = ix.shape
799
+ ix = rearrange(ix, "c (h ph) (w pw) -> (h w) (c ph pw)", ph=2, pw=2)
800
+ ix_id = torch.zeros(h // 2, w // 2, 3)
801
+ ix_id[..., 1] = ix_id[..., 1] + torch.arange(h // 2)[:, None]
802
+ ix_id[..., 2] = ix_id[..., 2] + torch.arange(w // 2)[None, :]
803
+ ix_id = rearrange(ix_id, "h w c -> (h w) c")
804
+ batch_frames.append([ix])
805
+ batch_frames_ids.append([ix_id])
806
+
807
+ x_list, x_id_list, mask_x_list, x_seq_length = [], [], [], []
808
+ for frames, frame_ids in zip(batch_frames, batch_frames_ids):
809
+ proj_frames = []
810
+ for idx, one_frame in enumerate(frames):
811
+ one_frame = self.img_in(one_frame)
812
+ proj_frames.append(one_frame)
813
+ ix = torch.cat(proj_frames, dim=0)
814
+ if_id = torch.cat(frame_ids, dim=0)
815
+ x_list.append(ix)
816
+ x_id_list.append(if_id)
817
+ mask_x_list.append(torch.ones(ix.shape[0]).to(ix.device, non_blocking=True).bool())
818
+ x_seq_length.append(ix.shape[0])
819
+ x = pad_sequence(tuple(x_list), batch_first=True)
820
+ x_ids = pad_sequence(tuple(x_id_list), batch_first=True).to(x) # [b,pad_seq,2] pad (0.,0.) at dim2
821
+ mask_x = pad_sequence(tuple(mask_x_list), batch_first=True)
822
+
823
+ txt = self.txt_in(context)
824
+ txt_ids = torch.zeros(context.shape[0], context.shape[1], 3).to(x)
825
+ mask_txt = torch.ones(context.shape[0], context.shape[1]).to(x.device, non_blocking=True).bool()
826
+
827
+ return x, x_ids, txt, txt_ids, y, mask_x, mask_txt, x_seq_length
828
+
829
+ def unpack(self, x: Tensor, cond: dict = None, x_seq_length: list = None) -> Tensor:
830
+ x_list = []
831
+ image_shapes = cond["x_shapes"]
832
+ for u, shape, seq_length in zip(x, image_shapes, x_seq_length):
833
+ height, width = shape
834
+ h, w = math.ceil(height / 2), math.ceil(width / 2)
835
+ u = rearrange(
836
+ u[seq_length-h*w:seq_length, ...],
837
+ "(h w) (c ph pw) -> (h ph w pw) c",
838
+ h=h,
839
+ w=w,
840
+ ph=2,
841
+ pw=2,
842
+ )
843
+ x_list.append(u)
844
+ x = pad_sequence(tuple(x_list), batch_first=True).permute(0, 2, 1)
845
+ return x
846
+
847
+ def forward(
848
+ self,
849
+ x: Tensor,
850
+ t: Tensor,
851
+ cond: dict = {},
852
+ guidance: Tensor | None = None,
853
+ gc_seg: int = 0,
854
+ **kwargs
855
+ ) -> Tensor:
856
+ x, x_ids, txt, txt_ids, y, mask_x, mask_txt, seq_length_list = self.prepare_input(x, cond)
857
+ # running on sequences img
858
+ vec = self.time_in(timestep_embedding(t, 256))
859
+ if self.guidance_embed:
860
+ if guidance is None:
861
+ raise ValueError("Didn't get guidance strength for guidance distilled model.")
862
+ vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
863
+ vec = vec + self.vector_in(y)
864
+ ids = torch.cat((txt_ids, x_ids), dim=1)
865
+ pe = self.pe_embedder(ids)
866
+
867
+ mask_aside = torch.cat((mask_txt, mask_x), dim=1)
868
+ mask = mask_aside[:, None, :] * mask_aside[:, :, None]
869
+
870
+ kwargs = dict(
871
+ vec=vec,
872
+ pe=pe,
873
+ mask=mask,
874
+ txt_length = txt.shape[1],
875
+ )
876
+ x = torch.cat((txt, x), 1)
877
+ if self.use_grad_checkpoint and gc_seg >= 0:
878
+ x = checkpoint_sequential(
879
+ functions=[partial(block, **kwargs) for block in self.double_blocks],
880
+ segments=gc_seg if gc_seg > 0 else len(self.double_blocks),
881
+ input=x,
882
+ use_reentrant=False
883
+ )
884
+ else:
885
+ for block in self.double_blocks:
886
+ x = block(x, **kwargs)
887
+
888
+ kwargs = dict(
889
+ vec=vec,
890
+ pe=pe,
891
+ mask=mask,
892
+ )
893
+
894
+ if self.use_grad_checkpoint and gc_seg >= 0:
895
+ x = checkpoint_sequential(
896
+ functions=[partial(block, **kwargs) for block in self.single_blocks],
897
+ segments=gc_seg if gc_seg > 0 else len(self.single_blocks),
898
+ input=x,
899
+ use_reentrant=False
900
+ )
901
+ else:
902
+ for block in self.single_blocks:
903
+ x = block(x, **kwargs)
904
+ x = x[:, txt.shape[1]:, ...]
905
+ x = self.final_layer(x, vec) # (N, T, patch_size ** 2 * out_channels) 6 64 64
906
+ x = self.unpack(x, cond, seq_length_list)
907
+ return x
908
+
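The joint attention mask in FluxMR.forward is the outer product of per-token validity masks over the concatenated [txt, img] sequence, so padded tokens neither attend nor get attended to. A toy sketch:

import torch

mask_txt = torch.tensor([[True, True, False]])           # (B, L_txt); last txt token is padding
mask_x = torch.tensor([[True, True, True, False]])       # (B, L_img); last img token is padding
mask_aside = torch.cat((mask_txt, mask_x), dim=1)         # (B, L)
mask = mask_aside[:, None, :] * mask_aside[:, :, None]    # (B, L, L); True only for valid-valid pairs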
909
+ @staticmethod
910
+ def get_config_template():
911
+ return dict_to_yaml('MODEL',
912
+ __class__.__name__,
913
+ FluxMR.para_dict,
914
+ set_name=True)
915
+ @BACKBONES.register_class()
916
+ class FluxEdit(FluxMR):
917
+ def prepare_input(self, x, cond, *args, **kwargs):
918
+ context, y = cond["context"], cond["y"]
919
+ batch_frames, batch_frames_ids, batch_shift = [], [], []
920
+
921
+ for ix, shape, is_align in zip(x, cond["x_shapes"], cond['align']):
922
+ # unpack image from sequence
923
+ ix = ix[:, :shape[0] * shape[1]].view(-1, shape[0], shape[1])
924
+ c, h, w = ix.shape
925
+ ix = rearrange(ix, "c (h ph) (w pw) -> (h w) (c ph pw)", ph=2, pw=2)
926
+ ix_id = torch.zeros(h // 2, w // 2, 3)
927
+ ix_id[..., 1] = ix_id[..., 1] + torch.arange(h // 2)[:, None]
928
+ ix_id[..., 2] = ix_id[..., 2] + torch.arange(w // 2)[None, :]
929
+ batch_shift.append(h // 2) #if is_align < 1 else batch_shift.append(0)
930
+ ix_id = rearrange(ix_id, "h w c -> (h w) c")
931
+ batch_frames.append([ix])
932
+ batch_frames_ids.append([ix_id])
933
+ if 'edit_x' in cond:
934
+ for i, edit in enumerate(cond['edit_x']):
935
+ if edit is None:
936
+ continue
937
+ for ie in edit:
938
+ ie = ie.squeeze(0)
939
+ c, h, w = ie.shape
940
+ ie = rearrange(ie, "c (h ph) (w pw) -> (h w) (c ph pw)", ph=2, pw=2)
941
+ ie_id = torch.zeros(h // 2, w // 2, 3)
942
+ ie_id[..., 1] = ie_id[..., 1] + torch.arange(batch_shift[i], h // 2 + batch_shift[i])[:, None]
943
+ ie_id[..., 2] = ie_id[..., 2] + torch.arange(w // 2)[None, :]
944
+ ie_id = rearrange(ie_id, "h w c -> (h w) c")
945
+ batch_frames[i].append(ie)
946
+ batch_frames_ids[i].append(ie_id)
947
+
948
+ x_list, x_id_list, mask_x_list, x_seq_length = [], [], [], []
949
+ for frames, frame_ids in zip(batch_frames, batch_frames_ids):
950
+ proj_frames = []
951
+ for idx, one_frame in enumerate(frames):
952
+ one_frame = self.img_in(one_frame)
953
+ proj_frames.append(one_frame)
954
+ ix = torch.cat(proj_frames, dim=0)
955
+ if_id = torch.cat(frame_ids, dim=0)
956
+ x_list.append(ix)
957
+ x_id_list.append(if_id)
958
+ mask_x_list.append(torch.ones(ix.shape[0]).to(ix.device, non_blocking=True).bool())
959
+ x_seq_length.append(ix.shape[0])
960
+ x = pad_sequence(tuple(x_list), batch_first=True)
961
+ x_ids = pad_sequence(tuple(x_id_list), batch_first=True).to(x) # [b,pad_seq,2] pad (0.,0.) at dim2
962
+ mask_x = pad_sequence(tuple(mask_x_list), batch_first=True)
963
+
964
+ txt_list, mask_txt_list, y_list = [], [], []
965
+ for sample_id, (ctx, yy) in enumerate(zip(context, y)):
966
+ ctx_batch = []
967
+ for frame_id, one_ctx in enumerate(ctx):
968
+ one_ctx = self.txt_in(one_ctx.to(x))
969
+ ctx_batch.append(one_ctx)
970
+ txt_list.append(torch.cat(ctx_batch, dim=0))
971
+ mask_txt_list.append(torch.ones(txt_list[-1].shape[0]).to(ctx.device, non_blocking=True).bool())
972
+ y_list.append(yy.mean(dim = 0, keepdim=True))
973
+ txt = pad_sequence(tuple(txt_list), batch_first=True)
974
+ txt_ids = torch.zeros(txt.shape[0], txt.shape[1], 3).to(x)
975
+ mask_txt = pad_sequence(tuple(mask_txt_list), batch_first=True)
976
+ y = torch.cat(y_list, dim=0)
977
+ return x, x_ids, txt, txt_ids, y, mask_x, mask_txt, x_seq_length
978
+
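Worth noting in FluxEdit.prepare_input: `batch_shift` offsets the row ids of the edit/reference latents by the target's packed height, so the target and its references occupy disjoint rows in RoPE space while sharing column ids. A minimal sketch with illustrative sizes:

import torch

h_target, w, h_ref = 4, 4, 3
shift = h_target                                    # plays the role of batch_shift[i]
ref_ids = torch.zeros(h_ref, w, 3)
ref_ids[..., 1] += torch.arange(shift, shift + h_ref)[:, None]   # rows start below the target
ref_ids[..., 2] += torch.arange(w)[None, :]                      # columns are shared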
979
+ def unpack(self, x: Tensor, cond: dict = None, x_seq_length: list = None) -> Tensor:
980
+ x_list = []
981
+ image_shapes = cond["x_shapes"]
982
+ for u, shape, seq_length in zip(x, image_shapes, x_seq_length):
983
+ height, width = shape
984
+ h, w = math.ceil(height / 2), math.ceil(width / 2)
985
+ u = rearrange(
986
+ u[:h*w, ...],
987
+ "(h w) (c ph pw) -> (h ph w pw) c",
988
+ h=h,
989
+ w=w,
990
+ ph=2,
991
+ pw=2,
992
+ )
993
+ x_list.append(u)
994
+ x = pad_sequence(tuple(x_list), batch_first=True).permute(0, 2, 1)
995
+ return x
996
+
997
+ def forward(
998
+ self,
999
+ x: Tensor,
1000
+ t: Tensor,
1001
+ cond: dict = {},
1002
+ guidance: Tensor | None = None,
1003
+ gc_seg: int = 0,
1004
+ text_position_embeddings = None
1005
+ ) -> Tensor:
1006
+ x, x_ids, txt, txt_ids, y, mask_x, mask_txt, seq_length_list = self.prepare_input(x, cond, text_position_embeddings)
1007
+ # running on sequences img
1008
+ vec = self.time_in(timestep_embedding(t, 256))
1009
+ if self.guidance_embed:
1010
+ if guidance is None:
1011
+ raise ValueError("Didn't get guidance strength for guidance distilled model.")
1012
+ vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
1013
+ vec = vec + self.vector_in(y)
1014
+ ids = torch.cat((txt_ids, x_ids), dim=1)
1015
+ pe = self.pe_embedder(ids)
1016
+
1017
+ mask_aside = torch.cat((mask_txt, mask_x), dim=1)
1018
+ mask = mask_aside[:, None, :] * mask_aside[:, :, None]
1019
+
1020
+ kwargs = dict(
1021
+ vec=vec,
1022
+ pe=pe,
1023
+ mask=mask,
1024
+ txt_length = txt.shape[1],
1025
+ )
1026
+ x = torch.cat((txt, x), 1)
1027
+
1028
+ if self.use_grad_checkpoint and gc_seg >= 0:
1029
+ x = checkpoint_sequential(
1030
+ functions=[partial(block, **kwargs) for block in self.double_blocks],
1031
+ segments=gc_seg if gc_seg > 0 else len(self.double_blocks),
1032
+ input=x,
1033
+ use_reentrant=False
1034
+ )
1035
+ else:
1036
+ for block in self.double_blocks:
1037
+ x = block(x, **kwargs)
1038
+
1039
+ kwargs = dict(
1040
+ vec=vec,
1041
+ pe=pe,
1042
+ mask=mask,
1043
+ )
1044
+
1045
+ if self.use_grad_checkpoint and gc_seg >= 0:
1046
+ x = checkpoint_sequential(
1047
+ functions=[partial(block, **kwargs) for block in self.single_blocks],
1048
+ segments=gc_seg if gc_seg > 0 else len(self.single_blocks),
1049
+ input=x,
1050
+ use_reentrant=False
1051
+ )
1052
+ else:
1053
+ for block in self.single_blocks:
1054
+ x = block(x, **kwargs)
1055
+ x = x[:, txt.shape[1]:, ...]
1056
+ x = self.final_layer(x, vec) # (N, T, patch_size ** 2 * out_channels) 6 64 64
1057
+ x = self.unpack(x, cond, seq_length_list)
1058
+ return x
1059
+ @staticmethod
1060
+ def get_config_template():
1061
+ return dict_to_yaml('MODEL',
1062
+ __class__.__name__,
1063
+ FluxEdit.para_dict,
1064
+ set_name=True)
model/layers.py ADDED
@@ -0,0 +1,356 @@
1
+ from __future__ import annotations
2
+
3
+ import math
4
+ from dataclasses import dataclass
5
+ from torch import Tensor, nn
6
+ import torch
7
+ from einops import rearrange, repeat
8
+ from torch import Tensor
9
+ from torch.nn.utils.rnn import pad_sequence
10
+
11
+ try:
12
+ from flash_attn import (
13
+ flash_attn_varlen_func
14
+ )
15
+ FLASHATTN_IS_AVAILABLE = True
16
+ except ImportError:
17
+ FLASHATTN_IS_AVAILABLE = False
18
+ flash_attn_varlen_func = None
19
+
20
+ def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, mask: Tensor | None = None, backend = 'pytorch') -> Tensor:
21
+ q, k = apply_rope(q, k, pe)
22
+ if backend == 'pytorch':
23
+ if mask is not None and mask.dtype == torch.bool:
24
+ mask = torch.zeros_like(mask).to(q).masked_fill_(mask.logical_not(), -1e20)
25
+ x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=mask)
26
+ # x = torch.nan_to_num(x, nan=0.0, posinf=1e10, neginf=-1e10)
27
+ x = rearrange(x, "B H L D -> B L (H D)")
28
+ elif backend == 'flash_attn':
29
+ # q: (B, H, L, D)
30
+ # k: (B, H, S, D) now L = S
31
+ # v: (B, H, S, D)
32
+ b, h, lq, d = q.shape
33
+ _, _, lk, _ = k.shape
34
+ q = rearrange(q, "B H L D -> B L H D")
35
+ k = rearrange(k, "B H S D -> B S H D")
36
+ v = rearrange(v, "B H S D -> B S H D")
37
+ if mask is None:
38
+ q_lens = torch.tensor([lq] * b, dtype=torch.int32).to(q.device, non_blocking=True)
39
+ k_lens = torch.tensor([lk] * b, dtype=torch.int32).to(k.device, non_blocking=True)
40
+ else:
41
+ q_lens = torch.sum(mask[:, 0, :, 0], dim=1).int()
42
+ k_lens = torch.sum(mask[:, 0, 0, :], dim=1).int()
43
+ q = torch.cat([q_v[:q_l] for q_v, q_l in zip(q, q_lens)])
44
+ k = torch.cat([k_v[:k_l] for k_v, k_l in zip(k, k_lens)])
45
+ v = torch.cat([v_v[:v_l] for v_v, v_l in zip(v, k_lens)])
46
+ cu_seqlens_q = torch.cat([q_lens.new_zeros([1]), q_lens]).cumsum(0, dtype=torch.int32)
47
+ cu_seqlens_k = torch.cat([k_lens.new_zeros([1]), k_lens]).cumsum(0, dtype=torch.int32)
48
+ max_seqlen_q = q_lens.max()
49
+ max_seqlen_k = k_lens.max()
50
+
51
+ x = flash_attn_varlen_func(
52
+ q,
53
+ k,
54
+ v,
55
+ cu_seqlens_q=cu_seqlens_q,
56
+ cu_seqlens_k=cu_seqlens_k,
57
+ max_seqlen_q=max_seqlen_q,
58
+ max_seqlen_k=max_seqlen_k
59
+ )
60
+ x_list = [x[cu_seqlens_q[i]:cu_seqlens_q[i+1]] for i in range(b)]
61
+ x = pad_sequence(tuple(x_list), batch_first=True)
62
+ x = rearrange(x, "B L H D -> B L (H D)")
63
+ else:
64
+ raise NotImplementedError
65
+ return x
66
+
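In the 'pytorch' branch above, a boolean attention mask is turned into an additive bias (-1e20 at disallowed positions) before scaled_dot_product_attention. A self-contained sketch of that conversion with toy shapes:

import torch
import torch.nn.functional as F

B, H, L, D = 1, 2, 5, 8
q, k, v = (torch.randn(B, H, L, D) for _ in range(3))
bool_mask = torch.ones(B, H, L, L, dtype=torch.bool)
bool_mask[..., -1] = False                                    # forbid attending to the last token
bias = torch.zeros_like(bool_mask, dtype=q.dtype).masked_fill_(bool_mask.logical_not(), -1e20)
out = F.scaled_dot_product_attention(q, k, v, attn_mask=bias)  # (B, H, L, D)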
67
+
68
+ def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
69
+ assert dim % 2 == 0
70
+ scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim
71
+ omega = 1.0 / (theta**scale)
72
+ out = torch.einsum("...n,d->...nd", pos, omega)
73
+ out = torch.stack([torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1)
74
+ out = rearrange(out, "b n d (i j) -> b n d i j", i=2, j=2)
75
+ return out.float()
76
+
77
+
78
+ def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor) -> tuple[Tensor, Tensor]:
79
+ xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
80
+ xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
81
+ xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
82
+ xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
83
+ return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
84
+
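A quick sanity sketch for `rope`/`apply_rope` as defined above (assumes both functions are in scope): at position 0 every per-pair rotation matrix is the identity, so queries and keys pass through unchanged:

import torch

pos = torch.zeros(1, 4)                     # (batch, tokens), all positions zero
pe = rope(pos, dim=8, theta=10000)          # (1, 4, 4, 2, 2) rotation matrices
xq, xk = torch.randn(1, 4, 8), torch.randn(1, 4, 8)
xq_out, xk_out = apply_rope(xq, xk, pe)
assert torch.allclose(xq_out, xq) and torch.allclose(xk_out, xk)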
85
+ class EmbedND(nn.Module):
86
+ def __init__(self, dim: int, theta: int, axes_dim: list[int]):
87
+ super().__init__()
88
+ self.dim = dim
89
+ self.theta = theta
90
+ self.axes_dim = axes_dim
91
+
92
+ def forward(self, ids: Tensor) -> Tensor:
93
+ n_axes = ids.shape[-1]
94
+ emb = torch.cat(
95
+ [rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)],
96
+ dim=-3,
97
+ )
98
+
99
+ return emb.unsqueeze(1)
100
+
101
+
102
+ def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0):
103
+ """
104
+ Create sinusoidal timestep embeddings.
105
+ :param t: a 1-D Tensor of N indices, one per batch element.
106
+ These may be fractional.
107
+ :param dim: the dimension of the output.
108
+ :param max_period: controls the minimum frequency of the embeddings.
109
+ :return: an (N, D) Tensor of positional embeddings.
110
+ """
111
+ t = time_factor * t
112
+ half = dim // 2
113
+ freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(
114
+ t.device
115
+ )
116
+
117
+ args = t[:, None].float() * freqs[None]
118
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
119
+ if dim % 2:
120
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
121
+ if torch.is_floating_point(t):
122
+ embedding = embedding.to(t)
123
+ return embedding
124
+
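A short usage sketch for `timestep_embedding` (assumes the function above is in scope): fractional timesteps are scaled by `time_factor` and mapped to concatenated cosine/sine features:

import torch

t = torch.tensor([0.25, 0.5, 1.0])          # one fractional timestep per batch element
emb = timestep_embedding(t, 256)            # -> shape (3, 256), [cos | sin] halves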
125
+
126
+ class MLPEmbedder(nn.Module):
127
+ def __init__(self, in_dim: int, hidden_dim: int):
128
+ super().__init__()
129
+ self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True)
130
+ self.silu = nn.SiLU()
131
+ self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True)
132
+
133
+ def forward(self, x: Tensor) -> Tensor:
134
+ return self.out_layer(self.silu(self.in_layer(x)))
135
+
136
+
137
+ class RMSNorm(torch.nn.Module):
138
+ def __init__(self, dim: int):
139
+ super().__init__()
140
+ self.scale = nn.Parameter(torch.ones(dim))
141
+
142
+ def forward(self, x: Tensor):
143
+ x_dtype = x.dtype
144
+ x = x.float()
145
+ rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + 1e-6)
146
+ return (x * rrms).to(dtype=x_dtype) * self.scale
147
+
148
+
149
+ class QKNorm(torch.nn.Module):
150
+ def __init__(self, dim: int):
151
+ super().__init__()
152
+ self.query_norm = RMSNorm(dim)
153
+ self.key_norm = RMSNorm(dim)
154
+
155
+ def forward(self, q: Tensor, k: Tensor, v: Tensor) -> tuple[Tensor, Tensor]:
156
+ q = self.query_norm(q)
157
+ k = self.key_norm(k)
158
+ return q.to(v), k.to(v)
159
+
160
+
161
+ class SelfAttention(nn.Module):
162
+ def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False):
163
+ super().__init__()
164
+ self.num_heads = num_heads
165
+ head_dim = dim // num_heads
166
+
167
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
168
+ self.norm = QKNorm(head_dim)
169
+ self.proj = nn.Linear(dim, dim)
170
+
171
+ def forward(self, x: Tensor, pe: Tensor, mask: Tensor | None = None) -> Tensor:
172
+ qkv = self.qkv(x)
173
+ q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
174
+ q, k = self.norm(q, k, v)
175
+ x = attention(q, k, v, pe=pe, mask=mask)
176
+ x = self.proj(x)
177
+ return x
178
+
179
+ class CrossAttention(nn.Module):
180
+ def __init__(self, dim: int, context_dim: int, num_heads: int = 8, qkv_bias: bool = False):
181
+ super().__init__()
182
+ self.num_heads = num_heads
183
+ head_dim = dim // num_heads
184
+ self.q = nn.Linear(dim, dim, bias=qkv_bias)
185
+ self.kv = nn.Linear(context_dim, dim * 2, bias=qkv_bias)
186
+ self.norm = QKNorm(head_dim)
187
+ self.proj = nn.Linear(dim, dim)
188
+
189
+ def forward(self, x: Tensor, context: Tensor, pe: Tensor, mask: Tensor | None = None) -> Tensor:
190
+ q = rearrange(self.q(x), "B L (H D) -> B H L D", H=self.num_heads)
191
+ k, v = rearrange(self.kv(context), "B S (K H D) -> K B H S D", K=2, H=self.num_heads)
192
+ q, k = self.norm(q, k, v)
193
+ x = attention(q, k, v, pe=pe, mask=mask)
194
+ x = self.proj(x)
195
+ return x
196
+
197
+
198
+ @dataclass
199
+ class ModulationOut:
200
+ shift: Tensor
201
+ scale: Tensor
202
+ gate: Tensor
203
+
204
+
205
+ class Modulation(nn.Module):
206
+ def __init__(self, dim: int, double: bool):
207
+ super().__init__()
208
+ self.is_double = double
209
+ self.multiplier = 6 if double else 3
210
+ self.lin = nn.Linear(dim, self.multiplier * dim, bias=True)
211
+
212
+ def forward(self, vec: Tensor) -> tuple[ModulationOut, ModulationOut | None]:
213
+ out = self.lin(nn.functional.silu(vec))[:, None, :].chunk(self.multiplier, dim=-1)
214
+
215
+ return (
216
+ ModulationOut(*out[:3]),
217
+ ModulationOut(*out[3:]) if self.is_double else None,
218
+ )
219
+
220
+
221
+ class DoubleStreamBlock(nn.Module):
222
+ def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, backend = 'pytorch'):
223
+ super().__init__()
224
+
225
+ mlp_hidden_dim = int(hidden_size * mlp_ratio)
226
+ self.num_heads = num_heads
227
+ self.hidden_size = hidden_size
228
+ self.img_mod = Modulation(hidden_size, double=True)
229
+ self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
230
+ self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
231
+
232
+ self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
233
+ self.img_mlp = nn.Sequential(
234
+ nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
235
+ nn.GELU(approximate="tanh"),
236
+ nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
237
+ )
238
+
239
+ self.backend = backend
240
+
241
+ self.txt_mod = Modulation(hidden_size, double=True)
242
+ self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
243
+ self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
244
+
245
+ self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
246
+ self.txt_mlp = nn.Sequential(
247
+ nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
248
+ nn.GELU(approximate="tanh"),
249
+ nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
250
+ )
251
+
252
+
253
+
254
+
255
+ def forward(self, x: Tensor, vec: Tensor, pe: Tensor, mask: Tensor = None, txt_length = None):
256
+ img_mod1, img_mod2 = self.img_mod(vec)
257
+ txt_mod1, txt_mod2 = self.txt_mod(vec)
258
+
259
+ txt, img = x[:, :txt_length], x[:, txt_length:]
260
+
261
+ # prepare image for attention
262
+ img_modulated = self.img_norm1(img)
263
+ img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
264
+ img_qkv = self.img_attn.qkv(img_modulated)
265
+ img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
266
+ img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)
267
+ # prepare txt for attention
268
+ txt_modulated = self.txt_norm1(txt)
269
+ txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
270
+ txt_qkv = self.txt_attn.qkv(txt_modulated)
271
+ txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
272
+ txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
273
+
274
+ # run actual attention
275
+ q = torch.cat((txt_q, img_q), dim=2)
276
+ k = torch.cat((txt_k, img_k), dim=2)
277
+ v = torch.cat((txt_v, img_v), dim=2)
278
+ if mask is not None:
279
+ mask = repeat(mask, 'B L S-> B H L S', H=self.num_heads)
280
+ attn = attention(q, k, v, pe=pe, mask = mask, backend = self.backend)
281
+ txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]
282
+
283
+ # calculate the img blocks
284
+ img = img + img_mod1.gate * self.img_attn.proj(img_attn)
285
+ img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift)
286
+
287
+ # calculate the txt blocks
288
+ txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
289
+ txt = txt + txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift)
290
+ x = torch.cat((txt, img), 1)
291
+ return x
292
+
293
+
294
+ class SingleStreamBlock(nn.Module):
295
+ """
296
+ A DiT block with parallel linear layers as described in
297
+ https://arxiv.org/abs/2302.05442 and adapted modulation interface.
298
+ """
299
+
300
+ def __init__(
301
+ self,
302
+ hidden_size: int,
303
+ num_heads: int,
304
+ mlp_ratio: float = 4.0,
305
+ qk_scale: float | None = None,
306
+ backend='pytorch'
307
+ ):
308
+ super().__init__()
309
+ self.hidden_dim = hidden_size
310
+ self.num_heads = num_heads
311
+ head_dim = hidden_size // num_heads
312
+ self.scale = qk_scale or head_dim**-0.5
313
+
314
+ self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
315
+ # qkv and mlp_in
316
+ self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim)
317
+ # proj and mlp_out
318
+ self.linear2 = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size)
319
+
320
+ self.norm = QKNorm(head_dim)
321
+
322
+ self.hidden_size = hidden_size
323
+ self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
324
+
325
+ self.mlp_act = nn.GELU(approximate="tanh")
326
+ self.modulation = Modulation(hidden_size, double=False)
327
+ self.backend = backend
328
+
329
+ def forward(self, x: Tensor, vec: Tensor, pe: Tensor, mask: Tensor = None) -> Tensor:
330
+ mod, _ = self.modulation(vec)
331
+ x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
332
+ qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
333
+
334
+ q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
335
+ q, k = self.norm(q, k, v)
336
+ if mask is not None:
337
+ mask = repeat(mask, 'B L S-> B H L S', H=self.num_heads)
338
+ # compute attention
339
+ attn = attention(q, k, v, pe=pe, mask = mask, backend=self.backend)
340
+ # compute activation in mlp stream, cat again and run second linear layer
341
+ output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
342
+ return x + mod.gate * output
343
+
344
+
345
+ class LastLayer(nn.Module):
346
+ def __init__(self, hidden_size: int, patch_size: int, out_channels: int):
347
+ super().__init__()
348
+ self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
349
+ self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
350
+ self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True))
351
+
352
+ def forward(self, x: Tensor, vec: Tensor) -> Tensor:
353
+ shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)
354
+ x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
355
+ x = self.linear(x)
356
+ return x
utils.py ADDED
@@ -0,0 +1,95 @@
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ import torch
3
+ import torchvision.transforms as T
4
+ from PIL import Image
5
+ from torchvision.transforms.functional import InterpolationMode
6
+
7
+ IMAGENET_MEAN = (0.485, 0.456, 0.406)
8
+ IMAGENET_STD = (0.229, 0.224, 0.225)
9
+
10
+
11
+ def build_transform(input_size):
12
+ MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
13
+ transform = T.Compose([
14
+ T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
15
+ T.Resize((input_size, input_size),
16
+ interpolation=InterpolationMode.BICUBIC),
17
+ T.ToTensor(),
18
+ T.Normalize(mean=MEAN, std=STD)
19
+ ])
20
+ return transform
21
+
22
+
23
+ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height,
24
+ image_size):
25
+ best_ratio_diff = float('inf')
26
+ best_ratio = (1, 1)
27
+ area = width * height
28
+ for ratio in target_ratios:
29
+ target_aspect_ratio = ratio[0] / ratio[1]
30
+ ratio_diff = abs(aspect_ratio - target_aspect_ratio)
31
+ if ratio_diff < best_ratio_diff:
32
+ best_ratio_diff = ratio_diff
33
+ best_ratio = ratio
34
+ elif ratio_diff == best_ratio_diff:
35
+ if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
36
+ best_ratio = ratio
37
+ return best_ratio
38
+
39
+
40
+ def dynamic_preprocess(image,
41
+ min_num=1,
42
+ max_num=12,
43
+ image_size=448,
44
+ use_thumbnail=False):
45
+ orig_width, orig_height = image.size
46
+ aspect_ratio = orig_width / orig_height
47
+
48
+ # calculate the existing image aspect ratio
49
+ target_ratios = set((i, j) for n in range(min_num, max_num + 1)
50
+ for i in range(1, n + 1) for j in range(1, n + 1)
51
+ if i * j <= max_num and i * j >= min_num)
52
+ target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
53
+
54
+ # find the closest aspect ratio to the target
55
+ target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio,
56
+ target_ratios, orig_width,
57
+ orig_height, image_size)
58
+
59
+ # calculate the target width and height
60
+ target_width = image_size * target_aspect_ratio[0]
61
+ target_height = image_size * target_aspect_ratio[1]
62
+ blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
63
+
64
+ # resize the image
65
+ resized_img = image.resize((target_width, target_height))
66
+ processed_images = []
67
+ for i in range(blocks):
68
+ box = ((i % (target_width // image_size)) * image_size,
69
+ (i // (target_width // image_size)) * image_size,
70
+ ((i % (target_width // image_size)) + 1) * image_size,
71
+ ((i // (target_width // image_size)) + 1) * image_size)
72
+ # split the image
73
+ split_img = resized_img.crop(box)
74
+ processed_images.append(split_img)
75
+ assert len(processed_images) == blocks
76
+ if use_thumbnail and len(processed_images) != 1:
77
+ thumbnail_img = image.resize((image_size, image_size))
78
+ processed_images.append(thumbnail_img)
79
+ return processed_images
80
+
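A brief usage sketch for `dynamic_preprocess` above: a 1024x512 input is matched to the 2:1 tiling, yielding two 448x448 crops plus a thumbnail when use_thumbnail=True:

from PIL import Image

img = Image.new('RGB', (1024, 512))
tiles = dynamic_preprocess(img, image_size=448, use_thumbnail=True, max_num=12)
print(len(tiles))   # expected: 3 (two tiles + one thumbnail)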
81
+
82
+ def load_image(image_file, input_size=448, max_num=12):
83
+ if isinstance(image_file, str):
84
+ image = Image.open(image_file).convert('RGB')
85
+ else:
86
+ image = image_file
87
+ transform = build_transform(input_size=input_size)
88
+ images = dynamic_preprocess(image,
89
+ image_size=input_size,
90
+ use_thumbnail=True,
91
+ max_num=max_num)
92
+ pixel_values = [transform(image) for image in images]
93
+ pixel_values = torch.stack(pixel_values)
94
+ return pixel_values
95
+