tsujuifu committed
Commit bdbb79e · 0 Parent(s)

archive v1
.gitattributes ADDED
@@ -0,0 +1,35 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,10 @@
1
+ ---
2
+ title: MLLM-guided Image Editing (MGIE)
3
+ emoji: 👩‍🎨
4
+ colorFrom: blue
5
+ colorTo: gray
6
+ sdk: gradio
7
+ sdk_version: 3.37.0
8
+ app_file: app.py
9
+ license: other
10
+ ---
_input/0.jpg ADDED
_input/1.jpg ADDED
_input/10.jpg ADDED
_input/11.jpg ADDED
_input/12.jpg ADDED
_input/13.jpg ADDED
_input/14.jpg ADDED
_input/15.jpg ADDED
_input/16.jpg ADDED
_input/17.jpg ADDED
_input/18.jpg ADDED
_input/19.jpg ADDED
_input/2.jpg ADDED
_input/3.jpg ADDED
_input/4.jpg ADDED
_input/5.jpg ADDED
_input/6.jpg ADDED
_input/7.jpg ADDED
_input/8.jpg ADDED
_input/9.jpg ADDED
app.py ADDED
@@ -0,0 +1,144 @@
1
+
2
+ import os
3
+ # os.system('cp -r ./_ckpt/LLaVA-7B-v1 /data/LLaVA-7B-v1'), os.system('cp -r ./_ckpt/mgie_7b /data/mgie_7b')
4
+ os.system('ls /data'), os.system('df -h /data')
5
+ [os.system('mv llava.py /home/user/.pyenv/versions/3.10.13/lib/python3.10/site-packages/llava/model/llava.py'),
6
+ os.system('mv train.py /home/user/.pyenv/versions/3.10.13/lib/python3.10/site-packages/llava/train/train.py')]
7
+
8
+ from PIL import Image
9
+
10
+ import numpy as np
11
+ import torch as T
12
+ import transformers, diffusers
13
+
14
+ from llava.conversation import conv_templates
15
+ from llava.model import *
16
+
17
+ import gradio as gr
18
+
19
+ def crop_resize(f, sz=512):
20
+ w, h = f.size
21
+ if w>h:
22
+ p = (w-h)//2
23
+ f = f.crop([p, 0, p+h, h])
24
+ elif h>w:
25
+ p = (h-w)//2
26
+ f = f.crop([0, p, w, p+w])
27
+ f = f.resize([sz, sz])
28
+ return f
29
+ def remove_alter(s): # hack expressive instruction
30
+ if 'ASSISTANT:' in s: s = s[s.index('ASSISTANT:')+10:].strip()
31
+ if '</s>' in s: s = s[:s.index('</s>')].strip()
32
+ if 'alternative' in s.lower(): s = s[:s.lower().index('alternative')]
33
+ if '[IMG0]' in s: s = s[:s.index('[IMG0]')]
34
+ s = '.'.join([s.strip() for s in s.split('.')[:2]])
35
+ if s[-1]!='.': s += '.'
36
+ return s.strip()
37
+
38
+ DEFAULT_IMAGE_TOKEN = '<image>'
39
+ DEFAULT_IMAGE_PATCH_TOKEN = '<im_patch>'
40
+ DEFAULT_IM_START_TOKEN = '<im_start>'
41
+ DEFAULT_IM_END_TOKEN = '<im_end>'
42
+ PATH_LLAVA = '/data/LLaVA-7B-v1'
43
+
44
+ tokenizer = transformers.AutoTokenizer.from_pretrained(PATH_LLAVA)
45
+ model = LlavaLlamaForCausalLM.from_pretrained(PATH_LLAVA, low_cpu_mem_usage=True, torch_dtype=T.float16, use_cache=True).cuda()
46
+ image_processor = transformers.CLIPImageProcessor.from_pretrained(model.config.mm_vision_tower, torch_dtype=T.float16)
47
+
48
+ tokenizer.padding_side = 'left'
49
+ tokenizer.add_tokens(['[IMG0]', '[IMG1]', '[IMG2]', '[IMG3]', '[IMG4]', '[IMG5]', '[IMG6]', '[IMG7]'], special_tokens=True)
50
+ model.resize_token_embeddings(len(tokenizer))
51
+ ckpt = T.load('/data/mgie_7b/mllm.pt', map_location='cpu')
52
+ model.load_state_dict(ckpt, strict=False)
53
+
54
+ mm_use_im_start_end = getattr(model.config, 'mm_use_im_start_end', False)
55
+ tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
56
+ if mm_use_im_start_end: tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
57
+
58
+ vision_tower = model.get_model().vision_tower[0]
59
+ vision_tower = transformers.CLIPVisionModel.from_pretrained(vision_tower.config._name_or_path, torch_dtype=T.float16, low_cpu_mem_usage=True).cuda()
60
+ model.get_model().vision_tower[0] = vision_tower
61
+ vision_config = vision_tower.config
62
+ vision_config.im_patch_token = tokenizer.convert_tokens_to_ids([DEFAULT_IMAGE_PATCH_TOKEN])[0]
63
+ vision_config.use_im_start_end = mm_use_im_start_end
64
+ if mm_use_im_start_end: vision_config.im_start_token, vision_config.im_end_token = tokenizer.convert_tokens_to_ids([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN])
65
+ image_token_len = (vision_config.image_size//vision_config.patch_size)**2
66
+
67
+ _ = model.eval()
68
+ EMB = ckpt['emb'].cuda()
69
+ with T.inference_mode(): NULL = model.edit_head(T.zeros(1, 8, 4096).half().to('cuda'), EMB)
70
+
71
+ pipe = diffusers.StableDiffusionInstructPix2PixPipeline.from_pretrained('timbrooks/instruct-pix2pix', torch_dtype=T.float16).to('cuda')
72
+ pipe.set_progress_bar_config(disable=True)
73
+ pipe.unet.load_state_dict(T.load('/data/mgie_7b/unet.pt', map_location='cpu'))
74
+ print('--init MGIE--')
75
+
76
+ def go_mgie(img, txt, seed, cfg_txt, cfg_img):
77
+ img, seed = crop_resize(Image.fromarray(img).convert('RGB')), int(seed)
78
+ inp = img
79
+
80
+ img = image_processor.preprocess(img, return_tensors='pt')['pixel_values'][0]
81
+ txt = "what will this image be like if '%s'"%(txt)
82
+ txt = txt+'\n'+DEFAULT_IM_START_TOKEN+DEFAULT_IMAGE_PATCH_TOKEN*image_token_len+DEFAULT_IM_END_TOKEN
83
+ conv = conv_templates['vicuna_v1_1'].copy()
84
+ conv.append_message(conv.roles[0], txt), conv.append_message(conv.roles[1], None)
85
+ txt = conv.get_prompt()
86
+ txt = tokenizer(txt)
87
+ txt, mask = T.as_tensor(txt['input_ids']), T.as_tensor(txt['attention_mask'])
88
+
89
+ with T.inference_mode():
90
+ out = model.generate(txt.unsqueeze(dim=0).cuda(), images=img.half().unsqueeze(dim=0).cuda(), attention_mask=mask.unsqueeze(dim=0).cuda(),
91
+ do_sample=False, max_new_tokens=96, num_beams=1, no_repeat_ngram_size=3,
92
+ return_dict_in_generate=True, output_hidden_states=True)
93
+ out, hid = out['sequences'][0].tolist(), T.cat([x[-1] for x in out['hidden_states']], dim=1)[0]
94
+
95
+ if 32003 in out: p = out.index(32003)-1
96
+ else: p = len(hid)-9
97
+ p = min(p, len(hid)-9)
98
+ hid = hid[p:p+8]
99
+
100
+ out = remove_alter(tokenizer.decode(out))
101
+ emb = model.edit_head(hid.unsqueeze(dim=0), EMB)
102
+ res = pipe(image=inp, prompt_embeds=emb, negative_prompt_embeds=NULL,
103
+ generator=T.Generator(device='cuda').manual_seed(seed), guidance_scale=cfg_txt, image_guidance_scale=cfg_img).images[0]
104
+
105
+ return res, out
106
+
107
+ def go_example(seed, cfg_txt, cfg_img):
108
+ txt = ['make the frame red', 'turn the day into night', 'give him a beard', 'make cottage a mansion',
109
+ 'remove yellow object from dogs paws', 'change the hair from red to blue', 'remove the text', 'increase the image contrast',
110
+ 'remove the people in the background', 'please make this photo professional looking', 'darken the image, sharpen it', 'photoshop the girl out',
111
+ 'make more brightness', 'take away the brown filter form the image', 'add more contrast to simulate more light', 'dark on rgb',
112
+ 'make the face happy', 'change view as ocean', 'replace basketball with soccer ball', 'let the floor be made of wood']
113
+ i = T.randint(len(txt), (1, )).item()
114
+
115
+ return './_input/%d.jpg'%(i), txt[i], seed, cfg_txt, cfg_img
116
+
117
+ go_mgie(np.array(Image.open('./_input/0.jpg').convert('RGB')), 'make the frame red', 13331, 7.5, 1.5)
118
+ print('--init GO--')
119
+
120
+ with gr.Blocks() as app:
121
+ gr.Markdown(
122
+ """
123
+ 🔔 we will have maintenance at 3 a.m. (PST)
124
+ # [ICLR\'24] Guiding Instruction-based Image Editing via Multimodal Large Language Models<br>
125
+ 🔔 this demo is hosted by [Tsu-Jui Fu](https://github.com/tsujuifu/pytorch_mgie)<br>
126
+ 🔔 a black image means that the output did not pass the [safety checker](https://huggingface.co/CompVis/stable-diffusion-safety-checker)<br>
127
+ 🔔 if the queue is full (*this app is too busy*), you can also try it [here](http://128.111.41.13:7122)<br>
128
+ 🔔 if the building process takes too long, please try refreshing the page
129
+ """
130
+ )
131
+ with gr.Row(): inp, res = [gr.Image(height=384, width=384, label='Input Image', interactive=True),
132
+ gr.Image(height=384, width=384, label='Goal Image', interactive=False)]
133
+ with gr.Row(): txt, out = [gr.Textbox(label='Instruction', interactive=True),
134
+ gr.Textbox(label='Expressive Instruction', interactive=False)]
135
+ with gr.Row(): seed, cfg_txt, cfg_img = [gr.Number(value=13331, label='Seed', interactive=True),
136
+ gr.Number(value=7.5, label='Text CFG', interactive=True),
137
+ gr.Number(value=1.5, label='Image CFG', interactive=True)]
138
+ with gr.Row(): btn_sub, btn_exp = [gr.Button('Submit'),
139
+ gr.Button('Example')]
140
+
141
+ btn_sub.click(fn=go_mgie, inputs=[inp, txt, seed, cfg_txt, cfg_img], outputs=[res, out])
142
+ btn_exp.click(fn=go_example, inputs=[seed, cfg_txt, cfg_img], outputs=[inp, txt, seed, cfg_txt, cfg_img])
143
+
144
+ app.queue(concurrency_count=1, max_size=75), app.launch()
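For reference, a minimal usage sketch of the inference entry point above. It assumes it runs inside app.py after the `--init MGIE--` initialization (so the tokenizer, MLLM, EMB/NULL embeddings, and the InstructPix2Pix pipeline already exist) and that the /data checkpoints are mounted; the output path edited.jpg is just illustrative.

```python
# Hedged sketch, not part of the commit: invoke go_mgie() directly once app.py has loaded.
import numpy as np
from PIL import Image

img = np.array(Image.open('./_input/0.jpg').convert('RGB'))   # any RGB image; go_mgie crop-resizes it to 512x512
res, expressive = go_mgie(img, 'make the frame red', 13331, 7.5, 1.5)
res.save('edited.jpg')   # res is the edited PIL image returned by the diffusion pipeline
print(expressive)        # the MLLM-derived expressive instruction shown in the UI
```

The Gradio Submit button wires this same function to the UI with the same (image, instruction, seed, text CFG, image CFG) arguments.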
llava.py ADDED
@@ -0,0 +1,404 @@
1
+
2
+ # modified from https://github.com/haotian-liu/LLaVA/blob/7ace501183c4bdec6052ec1a30039cdc3242a67c/llava/model/llava.py
3
+
4
+ from typing import List, Optional, Tuple, Union
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ from torch.nn import CrossEntropyLoss
10
+
11
+ from transformers import AutoConfig, AutoModelForCausalLM, \
12
+ LlamaConfig, LlamaModel, LlamaForCausalLM, \
13
+ CLIPVisionModel, CLIPImageProcessor
14
+
15
+ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
16
+
17
+ import os, diffusers
18
+
19
+ DEFAULT_IMAGE_TOKEN = "<image>"
20
+ DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
21
+ DEFAULT_IM_START_TOKEN = "<im_start>"
22
+ DEFAULT_IM_END_TOKEN = "<im_end>"
23
+
24
+
25
+ class LlavaConfig(LlamaConfig):
26
+ model_type = "llava"
27
+
28
+
29
+ class LlavaLlamaModel(LlamaModel):
30
+ config_class = LlavaConfig
31
+
32
+ def __init__(self, config: LlamaConfig):
33
+ super(LlavaLlamaModel, self).__init__(config)
34
+
35
+ if hasattr(config, "mm_vision_tower"):
36
+ # HACK: for FSDP
37
+ self.vision_tower = [CLIPVisionModel.from_pretrained(config.mm_vision_tower)]
38
+ # self.vision_tower = CLIPVisionModel.from_pretrained(config.mm_vision_tower)
39
+
40
+ if hasattr(config, "use_mm_proj"):
41
+ self.mm_projector = nn.Linear(config.mm_hidden_size, config.hidden_size)
42
+
43
+ def get_vision_tower(self):
44
+ vision_tower = getattr(self, 'vision_tower', None)
45
+ if type(vision_tower) is list:
46
+ vision_tower = vision_tower[0]
47
+ return vision_tower
48
+
49
+ def initialize_vision_modules(self, vision_tower, mm_vision_select_layer,
50
+ pretrain_mm_mlp_adapter=None, fsdp=None):
51
+ self.config.mm_vision_tower = vision_tower
52
+
53
+ image_processor = CLIPImageProcessor.from_pretrained(vision_tower)
54
+
55
+ if not hasattr(self, 'vision_tower'):
56
+ vision_tower = CLIPVisionModel.from_pretrained(vision_tower)
57
+ else:
58
+ vision_tower = self.vision_tower[0]
59
+ vision_tower.requires_grad_(False)
60
+
61
+ if fsdp is not None and len(fsdp) > 0:
62
+ self.vision_tower = [vision_tower]
63
+ else:
64
+ self.vision_tower = vision_tower
65
+
66
+ vision_config = vision_tower.config
67
+ num_patches = (vision_config.image_size // vision_config.patch_size) ** 2
68
+
69
+ self.config.use_mm_proj = True
70
+ self.config.mm_hidden_size = vision_config.hidden_size
71
+ self.config.mm_vision_select_layer = mm_vision_select_layer
72
+
73
+ if not hasattr(self, 'mm_projector'):
74
+ self.mm_projector = nn.Linear(vision_config.hidden_size, self.config.hidden_size)
75
+
76
+ if pretrain_mm_mlp_adapter is not None:
77
+ mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')
78
+ self.mm_projector.load_state_dict({k.split('.')[-1]: v for k, v in mm_projector_weights.items()})
79
+
80
+ return dict(
81
+ image_processor=image_processor,
82
+ image_token_len=num_patches,
83
+ vision_config=vision_config
84
+ )
85
+
86
+ def forward(
87
+ self,
88
+ input_ids: torch.LongTensor = None,
89
+ attention_mask: Optional[torch.Tensor] = None,
90
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
91
+ inputs_embeds: Optional[torch.FloatTensor] = None,
92
+ use_cache: Optional[bool] = None,
93
+ output_attentions: Optional[bool] = None,
94
+ output_hidden_states: Optional[bool] = None,
95
+ images: Optional[torch.FloatTensor] = None,
96
+ return_dict: Optional[bool] = None,
97
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
98
+
99
+ # HACK: replace back original embeddings for LLaVA pretraining
100
+ orig_embeds_params = getattr(self, 'orig_embeds_params', None)
101
+ # if orig_embeds_params is not None:
102
+ # orig_embeds_params = orig_embeds_params[0]
103
+ # with torch.no_grad():
104
+ # self.get_input_embeddings().weight.data[:-2] = orig_embeds_params[:-2].data
105
+
106
+ if inputs_embeds is None:
107
+ inputs_embeds = self.embed_tokens(input_ids)
108
+
109
+ vision_tower = self.get_vision_tower()
110
+ if vision_tower is not None and (input_ids.shape[1] != 1 or self.training) and images is not None:
111
+ # TODO: this is a modified multimodal LLM -- Haotian Liu
112
+ with torch.no_grad():
113
+ if type(images) is list:
114
+ # variable length images
115
+ image_features = []
116
+ for image in images:
117
+ image_forward_out = vision_tower(image.unsqueeze(0), output_hidden_states=True)
118
+ select_hidden_state_layer = getattr(self.config, "mm_vision_select_layer", -1)
119
+ select_hidden_state = image_forward_out.hidden_states[select_hidden_state_layer]
120
+ image_feature = select_hidden_state[:, 1:]
121
+ image_features.append(image_feature)
122
+ else:
123
+ image_forward_outs = vision_tower(images.to(vision_tower.dtype), output_hidden_states=True)
124
+ select_hidden_state_layer = getattr(self.config, "mm_vision_select_layer", -1)
125
+ select_hidden_state = image_forward_outs.hidden_states[select_hidden_state_layer]
126
+ image_features = select_hidden_state[:, 1:].to(images.dtype)
127
+ if type(images) is list:
128
+ image_features = [self.mm_projector(image_feature)[0] for image_feature in image_features]
129
+ else:
130
+ image_features = self.mm_projector(image_features)
131
+ dummy_image_features = torch.zeros(256, 1024, device=inputs_embeds.device, dtype=inputs_embeds.dtype)
132
+ dummy_image_features = self.mm_projector(dummy_image_features)
133
+
134
+ new_input_embeds = []
135
+ cur_image_idx = 0
136
+ for cur_input_ids, cur_input_embeds in zip(input_ids, inputs_embeds):
137
+ if (cur_input_ids == vision_tower.config.im_patch_token).sum() == 0:
138
+ # multimodal LLM, but the current sample is not multimodal
139
+ cur_input_embeds = cur_input_embeds + (0. * dummy_image_features).sum()
140
+ new_input_embeds.append(cur_input_embeds)
141
+ cur_image_idx += 1
142
+ continue
143
+ if vision_tower.config.use_im_start_end:
144
+ cur_image_features = image_features[cur_image_idx]
145
+ num_patches = cur_image_features.shape[0]
146
+ if (cur_input_ids == vision_tower.config.im_start_token).sum() != (cur_input_ids == vision_tower.config.im_end_token).sum():
147
+ raise ValueError("The number of image start tokens and image end tokens should be the same.")
148
+ image_start_tokens = torch.where(cur_input_ids == vision_tower.config.im_start_token)[0]
149
+ for image_start_token_pos in image_start_tokens:
150
+ cur_image_features = image_features[cur_image_idx].to(device=cur_input_embeds.device)
151
+ num_patches = cur_image_features.shape[0]
152
+ if cur_input_ids[image_start_token_pos + num_patches + 1] != vision_tower.config.im_end_token:
153
+ raise ValueError("The image end token should follow the image start token.")
154
+ if orig_embeds_params is not None:
155
+ cur_new_input_embeds = torch.cat((cur_input_embeds[:image_start_token_pos].detach(), cur_input_embeds[image_start_token_pos:image_start_token_pos+1], cur_image_features, cur_input_embeds[image_start_token_pos + num_patches + 1:image_start_token_pos + num_patches + 2], cur_input_embeds[image_start_token_pos + num_patches + 2:].detach()), dim=0)
156
+ else:
157
+ cur_new_input_embeds = torch.cat((cur_input_embeds[:image_start_token_pos+1], cur_image_features, cur_input_embeds[image_start_token_pos + num_patches + 1:]), dim=0)
158
+ cur_image_idx += 1
159
+ new_input_embeds.append(cur_new_input_embeds)
160
+ else:
161
+ cur_image_features = image_features[cur_image_idx]
162
+ num_patches = cur_image_features.shape[0]
163
+ if (cur_input_ids == vision_tower.config.im_patch_token).sum() != num_patches:
164
+ raise ValueError("The number of image patch tokens should be the same as the number of image patches.")
165
+ masked_indices = torch.where(cur_input_ids == vision_tower.config.im_patch_token)[0]
166
+ mask_index_start = masked_indices[0]
167
+ if (masked_indices != torch.arange(mask_index_start, mask_index_start+num_patches, device=masked_indices.device, dtype=masked_indices.dtype)).any():
168
+ raise ValueError("The image patch tokens should be consecutive.")
169
+ if orig_embeds_params is not None:
170
+ cur_new_input_embeds = torch.cat((cur_input_embeds[:mask_index_start].detach(), cur_image_features, cur_input_embeds[mask_index_start+num_patches:].detach()), dim=0)
171
+ else:
172
+ cur_new_input_embeds = torch.cat((cur_input_embeds[:mask_index_start], cur_image_features, cur_input_embeds[mask_index_start+num_patches:]), dim=0)
173
+ new_input_embeds.append(cur_new_input_embeds)
174
+ cur_image_idx += 1
175
+ inputs_embeds = torch.stack(new_input_embeds, dim=0)
176
+
177
+ return super(LlavaLlamaModel, self).forward(
178
+ input_ids=None, attention_mask=attention_mask, past_key_values=past_key_values,
179
+ inputs_embeds=inputs_embeds, use_cache=use_cache,
180
+ output_attentions=output_attentions, output_hidden_states=output_hidden_states,
181
+ return_dict=return_dict
182
+ )
183
+
184
+ class EditMapper(nn.Module):
185
+ def __init__(self):
186
+ super().__init__()
187
+
188
+ self.llm2hid = nn.Linear(4096, 512)
189
+ self.query = nn.Parameter(torch.randn(1, 77, 512))
190
+ self.mapper = nn.Transformer(batch_first=True, norm_first=True,
191
+ d_model=512, nhead=4, num_encoder_layers=4, num_decoder_layers=4,
192
+ dim_feedforward=2048, dropout=0.0)
193
+ self.hid2feat = nn.Linear(512, 768)
194
+
195
+ def forward(self, llm, emb):
196
+ hid = self.llm2hid(llm+emb)
197
+ hid = self.mapper(hid, self.query.repeat(llm.shape[0], 1, 1))
198
+ feat = self.hid2feat(hid)
199
+
200
+ return feat
201
+
202
+ class LlavaLlamaForCausalLM(LlamaForCausalLM):
203
+ config_class = LlavaConfig
204
+
205
+ def __init__(self, config):
206
+ super(LlamaForCausalLM, self).__init__(config)
207
+ self.model = LlavaLlamaModel(config)
208
+
209
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
210
+
211
+ self.edit_head = EditMapper()
212
+
213
+ '''self.scheduler, self.vae, self.unet = [diffusers.DDPMScheduler.from_pretrained('runwayml/stable-diffusion-v1-5', subfolder='scheduler'),
214
+ diffusers.AutoencoderKL.from_pretrained('runwayml/stable-diffusion-v1-5', subfolder='vae'),
215
+ diffusers.UNet2DConditionModel.from_pretrained('runwayml/stable-diffusion-v1-5', subfolder='unet')]
216
+ self.vae.requires_grad_(False)
217
+ self.unet.register_to_config(in_channels=8)
218
+ with torch.no_grad():
219
+ conv = torch.nn.Conv2d(8, self.unet.conv_in.out_channels, self.unet.conv_in.kernel_size, self.unet.conv_in.stride, self.unet.conv_in.padding)
220
+ conv.weight.zero_()
221
+ conv.weight[:, :4, :, :].copy_(self.unet.conv_in.weight)
222
+ self.unet.conv_in = conv'''
223
+
224
+ # Initialize weights and apply final processing
225
+ self.post_init()
226
+
227
+ def get_model(self):
228
+ return self.model
229
+
230
+ def get_vision_tower(self):
231
+ return self.get_model().get_vision_tower()
232
+
233
+ def get_vision_tower(self):
234
+ model = self.get_model()
235
+ vision_tower = model.vision_tower
236
+ if type(vision_tower) is list:
237
+ vision_tower = vision_tower[0]
238
+ return vision_tower
239
+
240
+ def forward(
241
+ self,
242
+ input_ids: torch.LongTensor = None,
243
+ attention_mask: Optional[torch.Tensor] = None,
244
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
245
+ inputs_embeds: Optional[torch.FloatTensor] = None,
246
+ labels: Optional[torch.LongTensor] = None,
247
+ use_cache: Optional[bool] = None,
248
+ output_attentions: Optional[bool] = None,
249
+ output_hidden_states: Optional[bool] = None,
250
+ images: Optional[torch.FloatTensor] = None,
251
+ return_dict: Optional[bool] = None,
252
+ p2p_inp=None, p2p_ans=None
253
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
254
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
255
+ output_hidden_states = (
256
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
257
+ )
258
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
259
+
260
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
261
+ outputs = self.model(
262
+ input_ids=input_ids,
263
+ attention_mask=attention_mask,
264
+ past_key_values=past_key_values,
265
+ inputs_embeds=inputs_embeds,
266
+ use_cache=use_cache,
267
+ output_attentions=output_attentions,
268
+ output_hidden_states=output_hidden_states,
269
+ return_dict=return_dict,
270
+ images=images
271
+ )
272
+
273
+ hidden_states = outputs[0]
274
+ logits = self.lm_head(hidden_states)
275
+
276
+ loss = None
277
+ if labels is not None:
278
+ # Shift so that tokens < n predict n
279
+ shift_logits = logits[..., :-1, :].contiguous()
280
+ shift_labels = labels[..., 1:].contiguous()
281
+ # Flatten the tokens
282
+ loss_fct = CrossEntropyLoss()
283
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
284
+ shift_labels = shift_labels.view(-1)
285
+ # Enable model/pipeline parallelism
286
+ shift_labels = shift_labels.to(shift_logits.device)
287
+ loss = loss_fct(shift_logits, shift_labels)
288
+
289
+ if labels is not None:
290
+ llm = []
291
+ for i in range(labels.shape[0]):
292
+ try: p = labels[i].data.cpu().tolist().index(32003)-1
293
+ except: p = len(labels[i])-9
294
+ p = min(len(hidden_states[i])-9, p)
295
+ llm.append(hidden_states[i][p:p+8].unsqueeze(0))
296
+ llm = torch.cat(llm, dim=0)
297
+ hid_edit = self.edit_head(llm, self.model.embed_tokens.weight[-8:].unsqueeze(dim=0).repeat(labels.shape[0], 1, 1))
298
+
299
+ B, DROP = labels.shape[0], 0.05
300
+
301
+ hid_null = self.edit_head(torch.zeros(B, 8, 4096, device=labels.device),
302
+ self.model.embed_tokens.weight[-8:].unsqueeze(dim=0).repeat(labels.shape[0], 1, 1))
303
+
304
+ with torch.no_grad():
305
+ lat_ans, lat_inp = self.vae.encode(p2p_ans).latent_dist.sample()*self.vae.config.scaling_factor, self.vae.encode(p2p_inp).latent_dist.mode()
306
+ lat_ans, lat_inp = [torch.from_numpy(lat_ans.data.cpu().float().numpy()).to(lat_ans.device),
307
+ torch.from_numpy(lat_inp.data.cpu().float().numpy()).to(lat_inp.device)]
308
+
309
+ noise = torch.randn_like(lat_ans)
310
+ ts = torch.randint(0, self.scheduler.config.num_train_timesteps, (B, ), device=noise.device).long()
311
+ lat_noise = self.scheduler.add_noise(lat_ans, noise, ts)
312
+
313
+ prob = torch.rand(B, device=lat_ans.device)
314
+ mask = (prob<(DROP*2)).reshape(B, 1, 1)
315
+ hid_edit = torch.where(mask, hid_null, hid_edit)
316
+ mask = (1.0-((prob>=DROP).to(lat_inp.dtype)*(prob<(DROP*3)).to(lat_inp.dtype))).reshape(B, 1, 1, 1)
317
+ lat_inp *= mask
318
+
319
+ out = self.unet(torch.cat([lat_noise, lat_inp], dim=1), ts, hid_edit).sample
320
+
321
+ loss_ce, loss_edit = loss, nn.functional.mse_loss(out, noise, reduction='mean')
322
+ if int(os.environ['LOCAL_RANK'])==0: print('loss_ce:', loss_ce, '/', 'loss_edit:', loss_edit)
323
+ loss = loss_ce+loss_edit*0.5
324
+
325
+ if not return_dict:
326
+ output = (logits,) + outputs[1:]
327
+ return (loss,) + output if loss is not None else output
328
+
329
+ return CausalLMOutputWithPast(
330
+ loss=loss,
331
+ logits=logits,
332
+ past_key_values=outputs.past_key_values,
333
+ hidden_states=outputs.hidden_states,
334
+ attentions=outputs.attentions,
335
+ )
336
+
337
+ def prepare_inputs_for_generation(
338
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
339
+ ):
340
+ if past_key_values:
341
+ input_ids = input_ids[:, -1:]
342
+
343
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
344
+ if inputs_embeds is not None and past_key_values is None:
345
+ model_inputs = {"inputs_embeds": inputs_embeds}
346
+ else:
347
+ model_inputs = {"input_ids": input_ids}
348
+
349
+ model_inputs.update(
350
+ {
351
+ "past_key_values": past_key_values,
352
+ "use_cache": kwargs.get("use_cache"),
353
+ "attention_mask": attention_mask,
354
+ "images": kwargs.get("images", None),
355
+ }
356
+ )
357
+ return model_inputs
358
+
359
+ def initialize_vision_tokenizer(self, mm_use_im_start_end, tokenizer, device,
360
+ tune_mm_mlp_adapter=False, pretrain_mm_mlp_adapter=None):
361
+ vision_config = self.get_vision_tower().config
362
+ vision_config.use_im_start_end = mm_use_im_start_end
363
+ tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
364
+ self.resize_token_embeddings(len(tokenizer))
365
+
366
+ if mm_use_im_start_end:
367
+ num_new_tokens = tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
368
+ self.resize_token_embeddings(len(tokenizer))
369
+ vision_config.im_start_token, vision_config.im_end_token = tokenizer.convert_tokens_to_ids([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN])
370
+
371
+ if num_new_tokens > 0:
372
+ input_embeddings = self.get_input_embeddings().weight.data
373
+ output_embeddings = self.get_output_embeddings().weight.data
374
+
375
+ input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
376
+ dim=0, keepdim=True)
377
+ output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
378
+ dim=0, keepdim=True)
379
+
380
+ input_embeddings[-num_new_tokens:] = input_embeddings_avg
381
+ output_embeddings[-num_new_tokens:] = output_embeddings_avg
382
+
383
+ if tune_mm_mlp_adapter:
384
+ self.get_model().orig_embeds_params = [self.get_input_embeddings().weight.data.clone().to(device=device)]
385
+ for p in self.get_input_embeddings().parameters():
386
+ p.requires_grad = True
387
+ for p in self.get_output_embeddings().parameters():
388
+ p.requires_grad = False
389
+
390
+ if pretrain_mm_mlp_adapter:
391
+ mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')
392
+ embed_tokens_weight = mm_projector_weights['model.embed_tokens.weight']
393
+ assert num_new_tokens == 2
394
+ if input_embeddings.shape == embed_tokens_weight.shape:
395
+ input_embeddings[-num_new_tokens:] = embed_tokens_weight[-num_new_tokens:]
396
+ elif embed_tokens_weight.shape[0] == num_new_tokens:
397
+ input_embeddings[-num_new_tokens:] = embed_tokens_weight
398
+ else:
399
+ raise ValueError(f"Unexpected embed_tokens_weight shape. Pretrained: {embed_tokens_weight.shape}. Current: {input_embeddings.shape}. Number of new tokens: {num_new_tokens}.")
400
+
401
+ vision_config.im_patch_token = tokenizer.convert_tokens_to_ids([DEFAULT_IMAGE_PATCH_TOKEN])[0]
402
+
403
+ AutoConfig.register("llava", LlavaConfig)
404
+ AutoModelForCausalLM.register(LlavaConfig, LlavaLlamaForCausalLM)
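To make the EditMapper contract above concrete, here is a shape-check sketch with random tensors rather than trained weights. The import path assumes llava.py has been copied over the installed llava package, as app.py does; the batch size of 2 is arbitrary.

```python
# Hedged sketch: EditMapper maps the 8 [IMG0]-[IMG7] hidden states (plus their token
# embeddings) to the 77x768 prompt embedding consumed by the InstructPix2Pix UNet.
import torch
from llava.model.llava import EditMapper  # assumes this file has replaced the installed module

head = EditMapper()
llm_hidden = torch.randn(2, 8, 4096)   # hidden states at the 8 visual-token positions
tok_embeds = torch.randn(1, 8, 4096)   # embeddings of those tokens, broadcast over the batch
prompt = head(llm_hidden, tok_embeds)
print(prompt.shape)                    # torch.Size([2, 77, 768]) -> prompt_embeds for the pipeline
```

In app.py the same call appears as model.edit_head(hid.unsqueeze(0), EMB), where EMB is ckpt['emb'].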
pre-requirements.txt ADDED
@@ -0,0 +1,9 @@
1
+ sentencepiece
2
+ transformers
3
+ diffusers
4
+ tokenizers
5
+ datasets
6
+ accelerate
7
+ evaluate
8
+ gradio
9
+ git+https://github.com/haotian-liu/LLaVA@7ace501
requirements.txt ADDED
@@ -0,0 +1,4 @@
1
+ -i https://download.pytorch.org/whl/cu113
2
+ torch==1.12.0
3
+ torchvision==0.13.0
4
+ torchaudio==0.12.0
train.py ADDED
@@ -0,0 +1,831 @@
1
+
2
+ # modified from https://github.com/haotian-liu/LLaVA/blob/7ace501183c4bdec6052ec1a30039cdc3242a67c/llava/train/train.py
3
+
4
+ import os
5
+ import copy
6
+ from dataclasses import dataclass, field
7
+ import json
8
+ import logging
9
+ import pathlib
10
+ from typing import Dict, Optional, Sequence, List
11
+
12
+ import torch
13
+
14
+ import transformers
15
+ from torch.utils.data import Dataset
16
+ from llava.train.llava_trainer import LLaVATrainer
17
+
18
+ from llava import conversation as conversation_lib
19
+ from llava.model import *
20
+
21
+ from PIL import Image
22
+ import torch.nn as nn
23
+
24
+ # TODO: import and use code from ../data/dataset.py
25
+
26
+ IGNORE_INDEX = -100
27
+ DEFAULT_PAD_TOKEN = "[PAD]"
28
+ DEFAULT_EOS_TOKEN = "</s>"
29
+ DEFAULT_BOS_TOKEN = "<s>"
30
+ DEFAULT_UNK_TOKEN = "<unk>"
31
+ DEFAULT_IMAGE_TOKEN = "<image>"
32
+ DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
33
+ DEFAULT_IM_START_TOKEN = "<im_start>"
34
+ DEFAULT_IM_END_TOKEN = "<im_end>"
35
+
36
+ import io, base64, pickle, random
37
+ from tqdm import tqdm
38
+ import numpy as np
39
+
40
+ def b2f(b): return Image.open(io.BytesIO(base64.b64decode(b))).convert('RGB')
41
+ def resize(f):
42
+ w, h = f.size
43
+ if w>h:
44
+ p = (w-h)//2
45
+ f = f.crop([p, 0, p+h, h])
46
+ elif h>w:
47
+ p = (h-w)//2
48
+ f = f.crop([0, p, w, p+w])
49
+ f = f.resize([512, 512])
50
+ return f
51
+ def img2npy(f): return (2.0*np.array(f)/255.0-1.0).transpose((2, 0, 1)).astype(np.float32)
52
+
53
+ @dataclass
54
+ class ModelArguments:
55
+ model_name_or_path: Optional[str] = field(default="facebook/opt-125m")
56
+ version: Optional[str] = field(default="v0")
57
+ freeze_backbone: bool = field(default=False)
58
+ tune_mm_mlp_adapter: bool = field(default=False)
59
+ vision_tower: Optional[str] = field(default=None)
60
+ mm_vision_select_layer: Optional[int] = field(default=-1) # default to the last layer
61
+ pretrain_mm_mlp_adapter: Optional[str] = field(default=None)
62
+ mm_use_im_start_end: bool = field(default=False)
63
+
64
+
65
+ @dataclass
66
+ class DataArguments:
67
+ data_path: str = field(default=None,
68
+ metadata={"help": "Path to the training data."})
69
+ lazy_preprocess: bool = False
70
+ is_multimodal: bool = False
71
+ sep_image_conv_front: bool = False
72
+ image_token_len: int = 0
73
+ image_folder: Optional[str] = field(default=None)
74
+ image_aspect_ratio: str = 'square'
75
+
76
+
77
+ @dataclass
78
+ class TrainingArguments(transformers.TrainingArguments):
79
+ cache_dir: Optional[str] = field(default=None)
80
+ optim: str = field(default="adamw_torch")
81
+ remove_unused_columns: bool = field(default=False)
82
+ freeze_mm_mlp_adapter: bool = field(default=False)
83
+ force_fsdp: bool = field(default=False)
84
+ model_max_length: int = field(
85
+ default=512,
86
+ metadata={
87
+ "help":
88
+ "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
89
+ },
90
+ )
91
+ double_quant: bool = field(
92
+ default=True,
93
+ metadata={"help": "Compress the quantization statistics through double quantization."}
94
+ )
95
+ quant_type: str = field(
96
+ default="nf4",
97
+ metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."}
98
+ )
99
+ bits: int = field(
100
+ default=16,
101
+ metadata={"help": "How many bits to use."}
102
+ )
103
+ lora_enable: bool = False
104
+ lora_r: int = 64
105
+ lora_alpha: int = 16
106
+ lora_dropout: float = 0.05
107
+ lora_weight_path: str = ""
108
+ lora_bias: str = "none"
109
+
110
+
111
+ def maybe_zero_3(param, ignore_status=False, name=None):
112
+ from deepspeed import zero
113
+ from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
114
+ if hasattr(param, "ds_id"):
115
+ if param.ds_status == ZeroParamStatus.NOT_AVAILABLE:
116
+ if not ignore_status:
117
+ logging.warning(f"{name}: param.ds_status != ZeroParamStatus.NOT_AVAILABLE: {param.ds_status}")
118
+ with zero.GatheredParameters([param]):
119
+ param = param.data.detach().cpu().clone()
120
+ else:
121
+ param = param.detach().cpu().clone()
122
+ return param
123
+
124
+
125
+ # Borrowed from peft.utils.get_peft_model_state_dict
126
+ def get_peft_state_maybe_zero_3(named_params, bias):
127
+ if bias == "none":
128
+ to_return = {k: t for k, t in named_params if "lora_" in k}
129
+ elif bias == "all":
130
+ to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k}
131
+ elif bias == "lora_only":
132
+ to_return = {}
133
+ maybe_lora_bias = {}
134
+ lora_bias_names = set()
135
+ for k, t in named_params:
136
+ if "lora_" in k:
137
+ to_return[k] = t
138
+ bias_name = k.split("lora_")[0] + "bias"
139
+ lora_bias_names.add(bias_name)
140
+ elif "bias" in k:
141
+ maybe_lora_bias[k] = t
142
+ for k, t in maybe_lora_bias.items():
143
+ if k in lora_bias_names:
144
+ to_return[k] = t
145
+ else:
146
+ raise NotImplementedError
147
+ to_return = {k: maybe_zero_3(v, name=k) for k, v in to_return.items()}
148
+ return to_return
149
+
150
+
151
+ def get_peft_state_non_lora_maybe_zero_3(named_params, require_grad_only=True):
152
+ to_return = {k: t for k, t in named_params if "lora_" not in k}
153
+ if require_grad_only:
154
+ to_return = {k: t for k, t in to_return.items() if t.requires_grad}
155
+ to_return = {k: maybe_zero_3(v, ignore_status=True).cpu() for k, v in to_return.items()}
156
+ return to_return
157
+
158
+
159
+ def find_all_linear_names(model):
160
+ cls = torch.nn.Linear
161
+ lora_module_names = set()
162
+ for name, module in model.named_modules():
163
+ if isinstance(module, cls):
164
+ names = name.split('.')
165
+ lora_module_names.add(names[0] if len(names) == 1 else names[-1])
166
+
167
+
168
+ if 'lm_head' in lora_module_names: # needed for 16-bit
169
+ lora_module_names.remove('lm_head')
170
+ return list(lora_module_names)
171
+
172
+
173
+ def safe_save_model_for_hf_trainer(trainer: transformers.Trainer,
174
+ output_dir: str):
175
+ """Collects the state dict and dump to disk."""
176
+ if trainer.deepspeed:
177
+ torch.cuda.synchronize()
178
+ trainer.save_model(output_dir)
179
+ return
180
+
181
+ state_dict = trainer.model.state_dict()
182
+ if trainer.args.should_save:
183
+ cpu_state_dict = {
184
+ key: value.cpu()
185
+ for key, value in state_dict.items()
186
+ }
187
+ del state_dict
188
+ trainer._save(output_dir, state_dict=cpu_state_dict) # noqa
189
+
190
+
191
+ def smart_tokenizer_and_embedding_resize(
192
+ special_tokens_dict: Dict,
193
+ tokenizer: transformers.PreTrainedTokenizer,
194
+ model: transformers.PreTrainedModel,
195
+ ):
196
+ """Resize tokenizer and embedding.
197
+
198
+ Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
199
+ """
200
+ num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
201
+ model.resize_token_embeddings(len(tokenizer))
202
+
203
+ if num_new_tokens > 0:
204
+ input_embeddings = model.get_input_embeddings().weight.data
205
+ output_embeddings = model.get_output_embeddings().weight.data
206
+
207
+ input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
208
+ dim=0, keepdim=True)
209
+ output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
210
+ dim=0, keepdim=True)
211
+
212
+ input_embeddings[-num_new_tokens:] = input_embeddings_avg
213
+ output_embeddings[-num_new_tokens:] = output_embeddings_avg
214
+
215
+
216
+ def _tokenize_fn(strings: Sequence[str],
217
+ tokenizer: transformers.PreTrainedTokenizer) -> Dict:
218
+ """Tokenize a list of strings."""
219
+ tokenized_list = [
220
+ tokenizer(
221
+ text,
222
+ return_tensors="pt",
223
+ padding="longest",
224
+ max_length=tokenizer.model_max_length,
225
+ truncation=True,
226
+ ) for text in strings
227
+ ]
228
+ input_ids = labels = [
229
+ tokenized.input_ids[0] for tokenized in tokenized_list
230
+ ]
231
+ input_ids_lens = labels_lens = [
232
+ tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item()
233
+ for tokenized in tokenized_list
234
+ ]
235
+ return dict(
236
+ input_ids=input_ids,
237
+ labels=labels,
238
+ input_ids_lens=input_ids_lens,
239
+ labels_lens=labels_lens,
240
+ )
241
+
242
+
243
+ def _mask_targets(target, tokenized_lens, speakers):
244
+ # cur_idx = 0
245
+ cur_idx = tokenized_lens[0]
246
+ tokenized_lens = tokenized_lens[1:]
247
+ target[:cur_idx] = IGNORE_INDEX
248
+ for tokenized_len, speaker in zip(tokenized_lens, speakers):
249
+ if speaker == "human":
250
+ target[cur_idx+2:cur_idx + tokenized_len] = IGNORE_INDEX
251
+ cur_idx += tokenized_len
252
+
253
+
254
+ def _add_speaker_and_signal(header, source, get_conversation=True):
255
+ """Add speaker and start/end signal on each round."""
256
+ BEGIN_SIGNAL = "### "
257
+ END_SIGNAL = "\n"
258
+ conversation = header
259
+ for sentence in source:
260
+ from_str = sentence["from"]
261
+ if from_str.lower() == "human":
262
+ from_str = conversation_lib.default_conversation.roles[0]
263
+ elif from_str.lower() == "gpt":
264
+ from_str = conversation_lib.default_conversation.roles[1]
265
+ else:
266
+ from_str = 'unknown'
267
+ sentence["value"] = (BEGIN_SIGNAL + from_str + ": " +
268
+ sentence["value"] + END_SIGNAL)
269
+ if get_conversation:
270
+ conversation += sentence["value"]
271
+ conversation += BEGIN_SIGNAL
272
+ return conversation
273
+
274
+
275
+ def preprocess_multimodal(
276
+ sources: Sequence[str],
277
+ multimodal_cfg: dict,
278
+ cur_token_len: int,
279
+ ) -> Dict:
280
+ is_multimodal = multimodal_cfg['is_multimodal']
281
+ # image_token_len = multimodal_cfg['image_token_len']
282
+ image_token_len = cur_token_len
283
+ if not is_multimodal:
284
+ return sources
285
+
286
+ for source in sources:
287
+ if multimodal_cfg['sep_image_conv_front']:
288
+ assert DEFAULT_IMAGE_TOKEN in source[0]['value']
289
+ source[0]['value'] = source[0]['value'].replace(DEFAULT_IMAGE_TOKEN, '').strip()
290
+ source[0]['value'] = DEFAULT_IMAGE_TOKEN + conversation_lib.default_conversation.sep + conversation_lib.default_conversation.roles[0] + ": " + source[0]['value']
291
+ for sentence in source:
292
+ replace_token = DEFAULT_IMAGE_PATCH_TOKEN * image_token_len
293
+ if multimodal_cfg['use_im_start_end']:
294
+ replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN
295
+ sentence["value"] = sentence["value"].replace(DEFAULT_IMAGE_TOKEN, replace_token)
296
+
297
+ return sources
298
+
299
+
300
+ def preprocess_v1(
301
+ sources,
302
+ tokenizer: transformers.PreTrainedTokenizer,
303
+ ) -> Dict:
304
+ conv = conversation_lib.default_conversation.copy()
305
+ roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
306
+
307
+ # Apply prompt templates
308
+ conversations = []
309
+ for i, source in enumerate(sources):
310
+ if roles[source[0]["from"]] != conv.roles[0]:
311
+ # Skip the first one if it is not from human
312
+ source = source[1:]
313
+
314
+ conv.messages = []
315
+ for j, sentence in enumerate(source):
316
+ role = roles[sentence["from"]]
317
+ assert role == conv.roles[j % 2], f"{i}"
318
+ conv.append_message(role, sentence["value"])
319
+ conversations.append(conv.get_prompt())
320
+
321
+ # Tokenize conversations
322
+ input_ids = tokenizer(
323
+ conversations,
324
+ return_tensors="pt",
325
+ padding="longest",
326
+ max_length=tokenizer.model_max_length,
327
+ truncation=True,
328
+ ).input_ids
329
+ targets = input_ids.clone()
330
+
331
+ assert conv.sep_style == conversation_lib.SeparatorStyle.TWO
332
+
333
+ # Mask targets
334
+ sep = conv.sep + conv.roles[1] + ": "
335
+ for conversation, target in zip(conversations, targets):
336
+ total_len = int(target.ne(tokenizer.pad_token_id).sum())
337
+
338
+ rounds = conversation.split(conv.sep2)
339
+ cur_len = 1
340
+ target[:cur_len] = IGNORE_INDEX
341
+ for i, rou in enumerate(rounds):
342
+ if rou == "":
343
+ break
344
+
345
+ parts = rou.split(sep)
346
+ if len(parts) != 2:
347
+ break
348
+ parts[0] += sep
349
+ round_len = len(tokenizer(rou).input_ids)
350
+ instruction_len = len(tokenizer(parts[0]).input_ids) - 2
351
+
352
+ target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
353
+
354
+ cur_len += round_len
355
+ target[cur_len:] = IGNORE_INDEX
356
+
357
+ if cur_len < tokenizer.model_max_length:
358
+ if cur_len != total_len:
359
+ target[:] = IGNORE_INDEX
360
+ print(
361
+ f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
362
+ f" (ignored)"
363
+ )
364
+
365
+ return dict(
366
+ input_ids=input_ids,
367
+ labels=targets,
368
+ )
369
+
370
+ def preprocess_mpt(
371
+ sources,
372
+ tokenizer: transformers.PreTrainedTokenizer,
373
+ ) -> Dict:
374
+ conv = conversation_lib.default_conversation.copy()
375
+ roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
376
+
377
+ # Apply prompt templates
378
+ conversations = []
379
+ for i, source in enumerate(sources):
380
+ if roles[source[0]["from"]] != conv.roles[0]:
381
+ # Skip the first one if it is not from human
382
+ source = source[1:]
383
+
384
+ conv.messages = []
385
+ for j, sentence in enumerate(source):
386
+ role = roles[sentence["from"]]
387
+ assert role == conv.roles[j % 2], f"{i}"
388
+ conv.append_message(role, sentence["value"])
389
+ conversations.append(conv.get_prompt())
390
+
391
+ # Tokenize conversations
392
+ input_ids = tokenizer(
393
+ conversations,
394
+ return_tensors="pt",
395
+ padding="longest",
396
+ max_length=tokenizer.model_max_length,
397
+ truncation=True,
398
+ ).input_ids
399
+ targets = input_ids.clone()
400
+ assert conv.sep_style == conversation_lib.SeparatorStyle.MPT
401
+
402
+ # Mask targets
403
+ sep = conv.sep + conv.roles[1]
404
+ for conversation, target in zip(conversations, targets):
405
+ total_len = int(target.ne(tokenizer.pad_token_id).sum())
406
+
407
+ rounds = conversation.split(conv.sep)
408
+ re_rounds = [conv.sep.join(rounds[:3])] # system + user + gpt
409
+ for conv_idx in range(3, len(rounds), 2):
410
+ re_rounds.append(conv.sep.join(rounds[conv_idx:conv_idx+2])) # user + gpt
411
+ cur_len = 0
412
+ target[:cur_len] = IGNORE_INDEX
413
+ for i, rou in enumerate(re_rounds):
414
+ if rou == "":
415
+ break
416
+
417
+ parts = rou.split(sep)
418
+ if len(parts) != 2:
419
+ break
420
+ parts[0] += sep
421
+ round_len = len(tokenizer(rou).input_ids) + len(tokenizer(conv.sep).input_ids)
422
+ instruction_len = len(tokenizer(parts[0]).input_ids)
423
+ target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
424
+
425
+ cur_len += round_len
426
+ target[cur_len:] = IGNORE_INDEX
427
+
428
+ if cur_len < tokenizer.model_max_length:
429
+ if cur_len != total_len:
430
+ target[:] = IGNORE_INDEX
431
+ print(
432
+ f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
433
+ f" (ignored)"
434
+ )
435
+
436
+ return dict(
437
+ input_ids=input_ids,
438
+ labels=targets,
439
+ )
440
+
441
+
442
+ def preprocess(
443
+ sources: Sequence[str],
444
+ tokenizer: transformers.PreTrainedTokenizer,
445
+ ) -> Dict:
446
+ """
447
+ Given a list of sources, each is a conversation list. This transform:
448
+ 1. Add signal '### ' at the beginning of each sentence, with end signal '\n';
449
+ 2. Concatenate conversations together;
450
+ 3. Tokenize the concatenated conversation;
451
+ 4. Make a deepcopy as the target. Mask human words with IGNORE_INDEX.
452
+ """
453
+ if conversation_lib.default_conversation.version == "v1":
454
+ return preprocess_v1(sources, tokenizer)
455
+ if conversation_lib.default_conversation.version == "mpt":
456
+ return preprocess_mpt(sources, tokenizer)
457
+ # add end signal and concatenate together
458
+ conversations = []
459
+ for source in sources:
460
+ header = f"{conversation_lib.default_conversation.system}\n\n"
461
+ conversation = _add_speaker_and_signal(header, source)
462
+ conversations.append(conversation)
463
+ # tokenize conversations
464
+ conversations_tokenized = _tokenize_fn(conversations, tokenizer)
465
+ input_ids = conversations_tokenized["input_ids"]
466
+ targets = copy.deepcopy(input_ids)
467
+ for target, source in zip(targets, sources):
468
+ tokenized_lens = _tokenize_fn([header] + [s["value"] for s in source],
469
+ tokenizer)["input_ids_lens"]
470
+ speakers = [sentence["from"] for sentence in source]
471
+ _mask_targets(target, tokenized_lens, speakers)
472
+
473
+ return dict(input_ids=input_ids, labels=targets)
474
+
475
+
476
+ class SupervisedDataset(Dataset):
477
+ """Dataset for supervised fine-tuning."""
478
+
479
+ def __init__(self, data_path: str,
480
+ tokenizer: transformers.PreTrainedTokenizer):
481
+ super(SupervisedDataset, self).__init__()
482
+ logging.warning("Loading data...")
483
+ list_data_dict = json.load(open(data_path, "r"))
484
+
485
+ logging.warning("Formatting inputs...")
486
+ sources = [example["conversations"] for example in list_data_dict]
487
+ data_dict = preprocess(sources, tokenizer)
488
+
489
+ self.input_ids = data_dict["input_ids"]
490
+ self.labels = data_dict["labels"]
491
+
492
+ def __len__(self):
493
+ return len(self.input_ids)
494
+
495
+ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
496
+ return dict(input_ids=self.input_ids[i], labels=self.labels[i])
497
+
498
+
499
+ class LazySupervisedDataset(Dataset):
500
+
501
+ def __init__(self, data_path: str,
502
+ tokenizer: transformers.PreTrainedTokenizer,
503
+ multimodal_cfg: dict):
504
+ super(LazySupervisedDataset, self).__init__()
505
+
506
+ self.tokenizer, self.multimodal_cfg = tokenizer, multimodal_cfg
507
+
508
+ self.pkl, self.prompt = pickle.load(open('./_data/ipr2pr.pkl', 'rb'))['task'], json.load(open('./_data/ipr2pr_expressive.json', 'r'))
509
+ random.shuffle(self.pkl)
510
+ print('--pkl: %d--'%(len(self.pkl)))
511
+
512
+ def __len__(self):
513
+ return len(self.pkl)
514
+
515
+ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
516
+ item = self.pkl[i][0]
517
+
518
+ tsv = open('./_data/ipr2pr.tsv', 'r')
519
+ tsv.seek(item['lineidx'])
520
+ b = tsv.readline().strip().split('\t')
521
+ image = resize(b2f(b[0]))
522
+
523
+ processor = self.multimodal_cfg['image_processor']
524
+ image = processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
525
+
526
+ cur_token_len = (image.shape[1]//14)*(image.shape[2]//14)
527
+ query = "what will this image be like if '%s'\n%s"%(item['instruction'], DEFAULT_IMAGE_TOKEN)
528
+ ans = '%s [IMG0] [IMG1] [IMG2] [IMG3] [IMG4] [IMG5] [IMG6] [IMG7]'%(self.prompt[item['input']]['expressive'])
529
+ sources = preprocess_multimodal(copy.deepcopy([[{'from': 'human', 'value': query}, {'from': 'gpt', 'value': ans}]]),
530
+ self.multimodal_cfg, cur_token_len)
531
+
532
+ data_dict = preprocess(sources, self.tokenizer)
533
+ if isinstance(i, int): data_dict = dict(input_ids=data_dict['input_ids'][0],
534
+ labels=data_dict['labels'][0])
535
+ data_dict['image'] = image
536
+
537
+ p2p_inp, p2p_ans = img2npy(resize(b2f(b[0])).resize([256, 256])), img2npy(resize(b2f(b[1])).resize([256, 256]))
538
+ data_dict['p2p_inp'], data_dict['p2p_ans'] = p2p_inp, p2p_ans
539
+
540
+ return data_dict
541
+
542
+
543
+ @dataclass
544
+ class DataCollatorForSupervisedDataset(object):
545
+ """Collate examples for supervised fine-tuning."""
546
+
547
+ tokenizer: transformers.PreTrainedTokenizer
548
+
549
+ def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
550
+ input_ids, labels = tuple([instance[key] for instance in instances]
551
+ for key in ("input_ids", "labels"))
552
+ input_ids = torch.nn.utils.rnn.pad_sequence(
553
+ input_ids,
554
+ batch_first=True,
555
+ padding_value=self.tokenizer.pad_token_id)
556
+ labels = torch.nn.utils.rnn.pad_sequence(labels,
557
+ batch_first=True,
558
+ padding_value=IGNORE_INDEX)
559
+ batch = dict(
560
+ input_ids=input_ids,
561
+ labels=labels,
562
+ attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
563
+ )
564
+
565
+ if 'image' in instances[0]:
566
+ images = [instance['image'] for instance in instances]
567
+ if all(x is not None and x.shape == images[0].shape for x in images):
568
+ batch['images'] = torch.stack(images)
569
+ else:
570
+ batch['images'] = images
571
+
572
+ batch['p2p_inp'], batch['p2p_ans'] = [torch.cat([torch.from_numpy(d['p2p_inp']).unsqueeze(dim=0) for d in instances], dim=0),
573
+ torch.cat([torch.from_numpy(d['p2p_ans']).unsqueeze(dim=0) for d in instances], dim=0)]
574
+
575
+ return batch
576
+
577
+
578
+ def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer,
579
+ data_args) -> Dict:
580
+ """Make dataset and collator for supervised fine-tuning."""
581
+ dataset_cls = (LazySupervisedDataset
582
+ if data_args.lazy_preprocess else SupervisedDataset)
583
+ train_dataset = dataset_cls(tokenizer=tokenizer,
584
+ data_path=data_args.data_path,
585
+ multimodal_cfg=dict(
586
+ is_multimodal=data_args.is_multimodal,
587
+ sep_image_conv_front=data_args.sep_image_conv_front,
588
+ image_token_len=data_args.image_token_len,
589
+ image_folder=data_args.image_folder,
590
+ image_aspect_ratio=data_args.image_aspect_ratio,
591
+ use_im_start_end=getattr(data_args, 'mm_use_im_start_end', False),
592
+ image_processor=getattr(data_args, 'image_processor', None)))
593
+ data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
594
+ return dict(train_dataset=train_dataset,
595
+ eval_dataset=None,
596
+ data_collator=data_collator)
597
+
598
+
599
+ def train():
600
+ parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
601
+ model_args, data_args, training_args = parser.parse_args_into_dataclasses()
602
+ compute_dtype = (torch.float16 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))
603
+
604
+ bnb_model_from_pretrained_args = {}
605
+ if training_args.bits in [4, 8]:
606
+ from transformers import BitsAndBytesConfig
607
+ from peft import prepare_model_for_int8_training
608
+ bnb_model_from_pretrained_args.update(dict(
609
+ device_map={"": training_args.device},
610
+ load_in_4bit=training_args.bits == 4,
611
+ load_in_8bit=training_args.bits == 8,
612
+ quantization_config=BitsAndBytesConfig(
613
+ load_in_4bit=training_args.bits == 4,
614
+ load_in_8bit=training_args.bits == 8,
615
+ llm_int8_threshold=6.0,
616
+ llm_int8_has_fp16_weight=False,
617
+ bnb_4bit_compute_dtype=compute_dtype,
618
+ bnb_4bit_use_double_quant=training_args.double_quant,
619
+ bnb_4bit_quant_type=training_args.quant_type # {'fp4', 'nf4'}
620
+ )
621
+ ))
622
+
623
+ if model_args.vision_tower is not None:
624
+ if 'mpt' in model_args.model_name_or_path:
625
+ model = LlavaMPTForCausalLM.from_pretrained(
626
+ model_args.model_name_or_path,
627
+ cache_dir=training_args.cache_dir,
628
+ **bnb_model_from_pretrained_args
629
+ )
630
+ else:
631
+ model = LlavaLlamaForCausalLM.from_pretrained(
632
+ model_args.model_name_or_path,
633
+ cache_dir=training_args.cache_dir,
634
+ **bnb_model_from_pretrained_args
635
+ )
636
+ else:
637
+ model = transformers.LlamaForCausalLM.from_pretrained(
638
+ model_args.model_name_or_path,
639
+ cache_dir=training_args.cache_dir,
640
+ **bnb_model_from_pretrained_args
641
+ )
642
+ model.config.use_cache = False
643
+
644
+ if model_args.freeze_backbone:
645
+ model.model.requires_grad_(False)
646
+
647
+ if training_args.bits in [4, 8]:
648
+ model.config.torch_dtype=(torch.float32 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))
649
+ model = prepare_model_for_int8_training(model, use_gradient_checkpointing=training_args.gradient_checkpointing)
650
+
651
+ if training_args.gradient_checkpointing and model_args.vision_tower is None:
652
+ if hasattr(model, "enable_input_require_grads"):
653
+ model.enable_input_require_grads()
654
+ else:
655
+ def make_inputs_require_grad(module, input, output):
656
+ output.requires_grad_(True)
657
+ model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
658
+
659
+ if training_args.lora_enable:
660
+ from peft import LoraConfig, get_peft_model
661
+ lora_config = LoraConfig(
662
+ r=training_args.lora_r,
663
+ lora_alpha=training_args.lora_alpha,
664
+ target_modules=find_all_linear_names(model),
665
+ lora_dropout=training_args.lora_dropout,
666
+ bias=training_args.lora_bias,
667
+ task_type="CAUSAL_LM",
668
+ )
669
+ if training_args.bits == 16:
670
+ if training_args.bf16:
671
+ model.to(torch.bfloat16)
672
+ if training_args.fp16:
673
+ model.to(torch.float16)
674
+ logging.warning("Adding LoRA adapters...")
675
+ model = get_peft_model(model, lora_config)
676
+
677
+ if 'mpt' in model_args.model_name_or_path:
678
+ tokenizer = transformers.AutoTokenizer.from_pretrained(
679
+ model_args.model_name_or_path,
680
+ cache_dir=training_args.cache_dir,
681
+ model_max_length=training_args.model_max_length,
682
+ padding_side="right"
683
+ )
684
+ else:
685
+ tokenizer = transformers.AutoTokenizer.from_pretrained(
686
+ model_args.model_name_or_path,
687
+ cache_dir=training_args.cache_dir,
688
+ model_max_length=training_args.model_max_length,
689
+ padding_side="right",
690
+ use_fast=False,
691
+ )
692
+
693
+ if model_args.version == "v0":
694
+ if tokenizer.pad_token is None:
695
+ smart_tokenizer_and_embedding_resize(
696
+ special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),
697
+ tokenizer=tokenizer,
698
+ model=model,
699
+ )
700
+ if "llama" in model_args.model_name_or_path:
701
+ tokenizer.add_special_tokens({
702
+ "eos_token": DEFAULT_EOS_TOKEN,
703
+ "bos_token": DEFAULT_BOS_TOKEN,
704
+ "unk_token": DEFAULT_UNK_TOKEN,
705
+ })
706
+ else:
707
+ tokenizer.pad_token = tokenizer.unk_token
708
+ if "mpt" in model_args.model_name_or_path:
709
+ conversation_lib.default_conversation = conversation_lib.conv_templates["mpt"]
710
+ else:
711
+ conversation_lib.default_conversation = conversation_lib.conv_templates["vicuna_v1_1"]
712
+
713
+ if model_args.vision_tower is not None:
714
+ model_vision_dict = model.get_model().initialize_vision_modules(
715
+ vision_tower=model_args.vision_tower,
716
+ mm_vision_select_layer=model_args.mm_vision_select_layer,
717
+ pretrain_mm_mlp_adapter=model_args.pretrain_mm_mlp_adapter,
718
+ fsdp=training_args.fsdp
719
+ )
720
+ model.get_vision_tower().to(dtype=torch.float16, device=training_args.device)
721
+ vision_config = model_vision_dict['vision_config']
722
+
723
+ data_args.image_token_len = model_vision_dict['image_token_len']
724
+ data_args.image_processor = model_vision_dict['image_processor']
725
+ data_args.is_multimodal = True
726
+
727
+ model.config.tune_mm_mlp_adapter = training_args.tune_mm_mlp_adapter = model_args.tune_mm_mlp_adapter
728
+ if model_args.tune_mm_mlp_adapter:
729
+ model.requires_grad_(False)
730
+ for p in model.get_model().mm_projector.parameters():
731
+ p.requires_grad = True
732
+
733
+ model.config.freeze_mm_mlp_adapter = training_args.freeze_mm_mlp_adapter
734
+ if training_args.freeze_mm_mlp_adapter:
735
+ for p in model.get_model().mm_projector.parameters():
736
+ p.requires_grad = False
737
+
738
+ if training_args.bits in [4, 8]:
739
+ model.get_model().mm_projector.to(dtype=compute_dtype, device=training_args.device)
740
+
741
+ model.config.mm_use_im_start_end = data_args.mm_use_im_start_end = model_args.mm_use_im_start_end
742
+ vision_config.use_im_start_end = training_args.use_im_start_end = model_args.mm_use_im_start_end
743
+ model.config.sep_image_conv_front = data_args.sep_image_conv_front
744
+ model.initialize_vision_tokenizer(mm_use_im_start_end=model_args.mm_use_im_start_end, tokenizer=tokenizer, device=training_args.device,
745
+ tune_mm_mlp_adapter=model_args.tune_mm_mlp_adapter, pretrain_mm_mlp_adapter=model_args.pretrain_mm_mlp_adapter)
746
+
747
+ params_no_grad = [n for n, p in model.named_parameters() if not p.requires_grad]
748
+ if len(params_no_grad) > 0:
749
+ if training_args.fsdp is not None and len(training_args.fsdp) > 0:
750
+ if len(params_no_grad) < 10:
751
+ print('[WARNING] Attempting to use FSDP while {} parameters do not require gradients: {}'.format(len(params_no_grad), params_no_grad))
752
+ else:
753
+ print('[WARNING] Attempting to use FSDP while {} parameters do not require gradients: {}...(omitted)'.format(len(params_no_grad), ', '.join(params_no_grad[:10])))
754
+ print("[WARNING] Attempting to use FSDP with partially frozen parameters, this is experimental.")
755
+ print("[WARNING] As of 4/30/23, this feature requires PyTorch-nightly build. See here for details: https://github.com/haotian-liu/LLaVA#experimental-use-fsdp-to-save-memory-in-pretraining")
756
+
757
+ from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP
758
+ def patch_FSDP_use_orig_params(func):
759
+ def wrap_func(*args, **kwargs):
760
+ use_orig_params = kwargs.pop('use_orig_params', True)
761
+ return func(*args, **kwargs, use_orig_params=use_orig_params)
762
+ return wrap_func
763
+
764
+ FSDP.__init__ = patch_FSDP_use_orig_params(FSDP.__init__)
765
+
766
+ if training_args.bits in [4, 8]:
767
+ from peft.tuners.lora import LoraLayer
768
+ for name, module in model.named_modules():
769
+ if isinstance(module, LoraLayer):
770
+ if training_args.bf16:
771
+ module = module.to(torch.bfloat16)
772
+ if 'norm' in name:
773
+ module = module.to(torch.float32)
774
+ if 'lm_head' in name or 'embed_tokens' in name:
775
+ if hasattr(module, 'weight'):
776
+ if training_args.bf16 and module.weight.dtype == torch.float32:
777
+ module = module.to(torch.bfloat16)
778
+
779
+ # start for MGIE
780
+ os.makedirs('_log', exist_ok=True)
781
+
782
+ pt = {}
783
+ for i in tqdm(range(2)): pt.update(torch.load('./_ckpt/LLaVA-7B-v1/pytorch_model-0000%d-of-00002.bin'%(i+1), map_location='cpu'))
784
+ miss, unexp = model.load_state_dict(pt, strict=False)
785
+ print('miss:', miss), print('unexp:', unexp)
786
+
787
+ tokenizer.add_tokens(['[IMG0]', '[IMG1]', '[IMG2]', '[IMG3]', '[IMG4]', '[IMG5]', '[IMG6]', '[IMG7]'], special_tokens=True)
788
+ model.resize_token_embeddings(len(tokenizer))
789
+ print(tokenizer), json.dump(tokenizer.get_vocab(), open('_log/vocabs.json', 'w'), indent=2)
790
+
791
+ for n, p in model.named_parameters():
792
+ if 'embed_tokens' in n or 'lm_head' in n or 'edit_head' in n or 'unet' in n: p.requires_grad = True
793
+ else: p.requires_grad = False
794
+ with open('_log/parameters.txt', 'w') as F:
795
+ for n, p in model.named_parameters(): F.write('%s %s %s\n'%(n, str(p.shape), str(p.requires_grad)))
796
+
797
+ with open('_log/args_train.txt', 'w') as F:
798
+ for key in vars(training_args): F.write('%s: %s\n'%(str(key), str(vars(training_args)[key])))
799
+ # end for MGIE
800
+
801
+ data_module = make_supervised_data_module(tokenizer=tokenizer,
802
+ data_args=data_args)
803
+ trainer = LLaVATrainer(model=model,
804
+ tokenizer=tokenizer,
805
+ args=training_args,
806
+ **data_module)
807
+
808
+ if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")):
809
+ trainer.train(resume_from_checkpoint=True)
810
+ else:
811
+ trainer.train()
812
+ trainer.save_state()
813
+
814
+ if training_args.lora_enable:
815
+ state_dict = get_peft_state_maybe_zero_3(
816
+ model.named_parameters(), training_args.lora_bias
817
+ )
818
+ non_lora_state_dict = get_peft_state_non_lora_maybe_zero_3(
819
+ model.named_parameters()
820
+ )
821
+ if training_args.local_rank == 0 or training_args.local_rank == -1:
822
+ model.config.save_pretrained(training_args.output_dir)
823
+ model.save_pretrained(training_args.output_dir, state_dict=state_dict)
824
+ torch.save(non_lora_state_dict, os.path.join(training_args.output_dir, 'non_lora_trainables.bin'))
825
+ else:
826
+ safe_save_model_for_hf_trainer(trainer=trainer,
827
+ output_dir=training_args.output_dir)
828
+
829
+
830
+ if __name__ == "__main__":
831
+ train()
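Finally, the conditioning-dropout step inside LlavaLlamaForCausalLM.forward (llava.py above, DROP = 0.05) is easy to misread, so here is a toy reproduction of just that masking logic with random numbers in place of real latents; nothing below is part of the original training code.

```python
# Hedged toy sketch of the classifier-free-guidance dropout used during MGIE training:
# the edit embedding is nulled when prob < 0.10, the input latent is zeroed when
# 0.05 <= prob < 0.15, and both are dropped together on the 5% overlap (fully unconditional).
import torch

B, DROP = 8, 0.05
prob = torch.rand(B)
drop_edit  = prob < DROP * 2                     # where hid_edit is replaced by hid_null
drop_image = (prob >= DROP) & (prob < DROP * 3)  # where lat_inp is multiplied by 0
print(drop_edit.int().tolist(), drop_image.int().tolist())
```

Training both unconditional branches this way is what lets the Text CFG and Image CFG scales in app.py (guidance_scale and image_guidance_scale) be applied at inference, mirroring InstructPix2Pix.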