ahmdliaqat committed (verified)
Commit fd73d17 · 1 Parent(s): efb26e1

Upload 12 files

Files changed (12)
  1. .DS_Store +0 -0
  2. LICENSE +201 -0
  3. README.md +59 -0
  4. cahceclear.py +35 -0
  5. default_config.yaml +16 -0
  6. gradio_demo_full.py +168 -0
  7. infer_full.py +143 -0
  8. requirements.txt +147 -0
  9. train_stage1.py +715 -0
  10. train_stage1.sh +21 -0
  11. train_stage2.py +816 -0
  12. train_stage2.sh +23 -0
.DS_Store ADDED
Binary file (8.2 kB)
 
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,59 @@
# Official Repo for Stable-Hair
<a href='https://xiaojiu-z.github.io/Stable-Hair.github.io/'><img src='https://img.shields.io/badge/Project-Page-green'></a>
<a href='https://arxiv.org/pdf/2407.14078'><img src='https://img.shields.io/badge/Technique-Report-red'></a>

**Stable-Hair: Real-World Hair Transfer via Diffusion Model**

*Yuxuan Zhang, Qing Zhang, Yiren Song, Jiaming Liu*

<img src='assets/teaser_.jpg'>

## Abstract
Current hair transfer methods struggle to handle diverse and intricate hairstyles, which limits their applicability in real-world scenarios. In this paper, we propose a novel diffusion-based hair transfer framework, named Stable-Hair, which robustly transfers a wide range of real-world hairstyles to user-provided faces for virtual hair try-on. To achieve this goal, our Stable-Hair framework is designed as a two-stage pipeline. In the first stage, we train a Bald Converter alongside Stable Diffusion to remove hair from the user-provided face images, resulting in bald images. In the second stage, we design a Hair Extractor and a Latent IdentityNet to transfer the target hairstyle to the bald image with high detail and high fidelity. The Hair Extractor is trained to encode reference images with the desired hairstyles, while the Latent IdentityNet ensures consistency in identity and background. To minimize color deviations between source images and transfer results, we introduce a novel Latent ControlNet architecture, which serves as both the Bald Converter and the Latent IdentityNet. After training on our curated triplet dataset, our method accurately transfers highly detailed and high-fidelity hairstyles to the source images. Extensive experiments demonstrate that our approach achieves state-of-the-art performance compared to existing hair transfer methods.
<img src='assets/method.jpg'>
## Todo List
1. - [x] Stage 1 inference code
2. - [x] Stage 1 pre-trained weights
3. - [x] Stage 2 inference code
4. - [x] Stage 2 pre-trained weights
5. - [x] Training code

## Getting Started
### Environment Setup
Our code is built on the [diffusers](https://github.com/huggingface/diffusers/) version of Stable Diffusion; in our experiments we use model version v1-5. If you already have a diffusers version of [controlnet](https://huggingface.co/docs/diffusers/using-diffusers/controlnet) configured, you can use this repository directly.
```shell
git clone https://github.com/Xiaojiu-z/Stable-Hair.git
cd Stable-Hair
```

### Pretrained Models
Download the pre-trained weights from [Google Drive](https://drive.google.com/drive/folders/1E-8Udfw8S8IorCWhBgS4FajIbqlrWRbQ?usp=drive_link) and save them to the directories `models/stage1` and `models/stage2`.
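
The inference scripts read their model and checkpoint paths from `./configs/hair_transfer.yaml`, which is not part of this commit. As a rough sketch, the keys below mirror the fields read by `infer_full.py` and `gradio_demo_full.py`; the file names and paths are placeholders, so adjust them to whatever you downloaded:

```yaml
# Illustrative sketch only: key names follow infer_full.py, values and filenames are placeholders.
pretrained_model_path: runwayml/stable-diffusion-v1-5    # or a local SD v1-5 folder
pretrained_folder: ./models/stage2                       # joined with the three relative paths below
controlnet_path: identitynet.pth                         # Latent IdentityNet weights
encoder_path: hair_encoder.pth                           # Hair Extractor weights
adapter_path: hair_adapter.pth                           # adapter weights injected into the UNet
bald_converter_path: ./models/stage1/bald_converter.pth  # used as-is (not joined with pretrained_folder)
```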

### Inference
```shell
python infer_full.py
```
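
`infer_full.py` takes its inputs from the `inference_kwargs` section of the same config. For reference, here is a minimal programmatic sketch of the two-stage flow using the class and method defined in `infer_full.py` (the image paths are placeholders):

```python
import numpy as np
from PIL import Image
from infer_full import StableHair

model = StableHair(config="./configs/hair_transfer.yaml")
# Hair_Transfer runs stage 1 (bald conversion) internally, then stage 2 (hair transfer).
source, result, bald, reference = model.Hair_Transfer(
    source_image="examples/source.jpg",        # placeholder path
    reference_image="examples/reference.jpg",  # placeholder path
    random_seed=42,
    step=30,
    guidance_scale=1.5,
    scale=1.0,
    controlnet_conditioning_scale=1.0,
)
# The transfer result comes back as a float array in [0, 1].
Image.fromarray((result * 255.).astype(np.uint8)).save("result.png")
```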

### Gradio demo
We provide a simple Gradio demo for more flexible use.
```shell
python gradio_demo_full.py
```
### Limitation
The results depend heavily on the first stage: if the bald converter performs poorly, the quality of the hair transfer suffers.
Note that the released model was trained on a small dataset (6k images for stage 1, 20k images for stage 2), and all face images come from the FFHQ dataset, which means they are cropped and aligned. In my experience, fine-tuning this model on video and full-body datasets is a good way to improve it; due to some restrictions, the improved model will not be publicly available. If you want to push the boundaries of this model, feel free to contact me.

## Cite
```
@misc{zhang2024stablehairrealworldhairtransfer,
      title={Stable-Hair: Real-World Hair Transfer via Diffusion Model},
      author={Yuxuan Zhang and Qing Zhang and Yiren Song and Jiaming Liu},
      year={2024},
      eprint={2407.14078},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
      url={https://arxiv.org/abs/2407.14078},
}
```
cahceclear.py ADDED
@@ -0,0 +1,35 @@
import os
import gc
import torch
import psutil

def clear_system_memory():
    # Run Python garbage collection
    print("Clearing Python cache...")
    gc.collect()

    # Clear GPU cache if PyTorch is being used
    if torch.cuda.is_available():
        print("Clearing GPU cache...")
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

    # Drop the OS page cache (Linux only; writing to /proc/sys/vm/drop_caches requires root)
    if os.name == 'posix':
        print("Clearing RAM caches...")
        os.system('sync && echo 3 > /proc/sys/vm/drop_caches')

    # List and terminate high-memory processes (destructive: use with caution)
    print("Killing high-memory processes...")
    for proc in psutil.process_iter(['pid', 'name', 'memory_info']):
        try:
            if proc.info['memory_info'].rss > 500 * 1024 * 1024:  # Threshold: 500 MB
                print(f"Killing process {proc.info['name']} (PID: {proc.info['pid']})")
                proc.terminate()
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            continue

    print("Memory cleared successfully!")

# Execute the cleaning process
clear_system_memory()
default_config.yaml ADDED
@@ -0,0 +1,16 @@
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: MULTI_GPU
main_process_port: 17362
downcast_bf16: 'no'
gpu_ids: 0,1,2,3
machine_rank: 0
main_training_function: main
mixed_precision: fp16
num_machines: 1
num_processes: 4
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
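
This is the Accelerate launcher config used for multi-GPU training (4 processes, fp16). A typical stage-1 launch would pass it via `--config_file`; the flag names below come from the argparse in `train_stage1.py`, but the values are placeholders (the repo also ships `train_stage1.sh` for launching):

# Illustrative launch only; adjust paths and hyperparameters to your setup.
accelerate launch --config_file default_config.yaml train_stage1.py \
  --pretrained_model_name_or_path runwayml/stable-diffusion-v1-5 \
  --train_data_dir ./data/stage1_pairs.json \
  --source_column source --target_column target \
  --output_dir ./models/stage1 \
  --train_batch_size 4 --learning_rate 1e-5 --mixed_precision fp16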
gradio_demo_full.py ADDED
@@ -0,0 +1,168 @@
import gradio as gr
import torch
from PIL import Image
import numpy as np
from omegaconf import OmegaConf
import os
import cv2
from diffusers import DDIMScheduler, UniPCMultistepScheduler
from diffusers.models import UNet2DConditionModel
from ref_encoder.latent_controlnet import ControlNetModel
from ref_encoder.adapter import *
from ref_encoder.reference_unet import ref_unet
from utils.pipeline import StableHairPipeline
from utils.pipeline_cn import StableDiffusionControlNetPipeline

torch.cuda.set_per_process_memory_fraction(0.80, device="cuda:0")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class StableHair:

    def __init__(self, config="./configs/hair_transfer.yaml", device=device, weight_dtype=torch.float32) -> None:
        print("Initializing Stable Hair Pipeline...")
        self.config = OmegaConf.load(config)
        self.device = device

        ### Load vae controlnet
        unet = UNet2DConditionModel.from_pretrained(self.config.pretrained_model_path, subfolder="unet").to(device)
        controlnet = ControlNetModel.from_unet(unet).to(device)
        _state_dict = torch.load(os.path.join(self.config.pretrained_folder, self.config.controlnet_path))
        controlnet.load_state_dict(_state_dict, strict=False)
        controlnet.to(weight_dtype)

        ### >>> create pipeline >>> ###
        self.pipeline = StableHairPipeline.from_pretrained(
            self.config.pretrained_model_path,
            controlnet=controlnet,
            safety_checker=None,
            torch_dtype=weight_dtype,
        ).to(device)
        self.pipeline.scheduler = DDIMScheduler.from_config(self.pipeline.scheduler.config)

        ### load Hair encoder/adapter
        self.hair_encoder = ref_unet.from_pretrained(self.config.pretrained_model_path, subfolder="unet").to(device)
        _state_dict = torch.load(os.path.join(self.config.pretrained_folder, self.config.encoder_path))
        self.hair_encoder.load_state_dict(_state_dict, strict=False)
        self.hair_adapter = adapter_injection(self.pipeline.unet, device=self.device, dtype=torch.float32, use_resampler=False)
        _state_dict = torch.load(os.path.join(self.config.pretrained_folder, self.config.adapter_path))
        self.hair_adapter.load_state_dict(_state_dict, strict=False)

        ### load bald converter
        bald_converter = ControlNetModel.from_unet(unet).to(device)
        _state_dict = torch.load(self.config.bald_converter_path)
        bald_converter.load_state_dict(_state_dict, strict=False)
        bald_converter.to(dtype=weight_dtype)
        del unet

        ### create pipeline for hair removal
        self.remove_hair_pipeline = StableDiffusionControlNetPipeline.from_pretrained(
            self.config.pretrained_model_path,
            controlnet=bald_converter,
            safety_checker=None,
            torch_dtype=weight_dtype,
        )
        self.remove_hair_pipeline.scheduler = UniPCMultistepScheduler.from_config(self.remove_hair_pipeline.scheduler.config)
        self.remove_hair_pipeline = self.remove_hair_pipeline.to(device)

        ### move to fp16
        self.hair_encoder.to(weight_dtype)
        self.hair_adapter.to(weight_dtype)

        print("Initialization Done!")

    def Hair_Transfer(self, source_image, reference_image, random_seed, step, guidance_scale, scale, controlnet_conditioning_scale):
        prompt = ""
        n_prompt = ""
        random_seed = int(random_seed)
        step = int(step)
        guidance_scale = float(guidance_scale)
        scale = float(scale)
        controlnet_conditioning_scale = float(controlnet_conditioning_scale)

        # load imgs
        H, W, C = source_image.shape

        # generate images
        set_scale(self.pipeline.unet, scale)
        generator = torch.Generator(device="cuda")
        generator.manual_seed(random_seed)
        sample = self.pipeline(
            prompt,
            negative_prompt=n_prompt,
            num_inference_steps=step,
            guidance_scale=guidance_scale,
            width=W,
            height=H,
            controlnet_condition=source_image,
            controlnet_conditioning_scale=controlnet_conditioning_scale,
            generator=generator,
            reference_encoder=self.hair_encoder,
            ref_image=reference_image,
        ).samples
        return sample, source_image, reference_image

    def get_bald(self, id_image, scale):
        W, H = id_image.size  # PIL's Image.size is (width, height)
        scale = float(scale)
        image = self.remove_hair_pipeline(
            prompt="",
            negative_prompt="",
            num_inference_steps=30,
            guidance_scale=1.5,
            width=W,
            height=H,
            image=id_image,
            controlnet_conditioning_scale=scale,
            generator=None,
        ).images[0]

        return image


model = StableHair(config="./configs/hair_transfer.yaml", weight_dtype=torch.float16)

# Define your ML model or function here
def model_call(id_image, ref_hair, converter_scale, scale, guidance_scale, controlnet_conditioning_scale):
    # Your ML logic goes here
    id_image = Image.fromarray(id_image.astype('uint8'), 'RGB')
    ref_hair = Image.fromarray(ref_hair.astype('uint8'), 'RGB')
    id_image = id_image.resize((512, 512))
    ref_hair = ref_hair.resize((512, 512))
    id_image_bald = model.get_bald(id_image, converter_scale)

    id_image_bald = np.array(id_image_bald)
    ref_hair = np.array(ref_hair)

    image, source_image, reference_image = model.Hair_Transfer(source_image=id_image_bald,
                                                               reference_image=ref_hair,
                                                               random_seed=-1,
                                                               step=30,
                                                               guidance_scale=guidance_scale,
                                                               scale=scale,
                                                               controlnet_conditioning_scale=controlnet_conditioning_scale
                                                               )

    image = Image.fromarray((image * 255.).astype(np.uint8))
    return id_image_bald, image

# Create a Gradio interface
image1 = gr.inputs.Image(label="id_image")
image2 = gr.inputs.Image(label="ref_hair")
number0 = gr.inputs.Slider(minimum=0.5, maximum=1.5, default=1, label="Converter Scale")
number1 = gr.inputs.Slider(minimum=0.0, maximum=3, default=1.0, label="Hair Encoder Scale")
number2 = gr.inputs.Slider(minimum=1.1, maximum=3.0, default=1.5, label="CFG")
number3 = gr.inputs.Slider(minimum=0.1, maximum=2.0, default=1, label="Latent IdentityNet Scale")
output1 = gr.outputs.Image(type="pil", label="Bald_Result")
output2 = gr.outputs.Image(type="pil", label="Transfer Result")

iface = gr.Interface(
    fn=lambda id_image, ref_hair, num0, num1, num2, num3: model_call(id_image, ref_hair, num0, num1, num2, num3),
    inputs=[image1, image2, number0, number1, number2, number3],
    outputs=[output1, output2],
    title="Hair Transfer Demo",
    description="In general, aligned faces work best, but non-aligned faces can also be used; inputs are resized to 512 x 512."
)

# Launch the Gradio interface
iface.queue().launch(server_name='0.0.0.0', server_port=8986)
infer_full.py ADDED
@@ -0,0 +1,143 @@
import gradio as gr
import torch
from PIL import Image
import numpy as np
from omegaconf import OmegaConf
import os
import cv2
from diffusers import DDIMScheduler, UniPCMultistepScheduler
from diffusers.models import UNet2DConditionModel
from ref_encoder.latent_controlnet import ControlNetModel
from ref_encoder.adapter import *
from ref_encoder.reference_unet import ref_unet
from utils.pipeline import StableHairPipeline
from utils.pipeline_cn import StableDiffusionControlNetPipeline

def concatenate_images(image_files, output_file, type="pil"):
    if type == "np":
        image_files = [Image.fromarray(img) for img in image_files]
    images = image_files  # list
    max_height = max(img.height for img in images)
    images = [img.resize((img.width, max_height)) for img in images]
    total_width = sum(img.width for img in images)
    combined = Image.new('RGB', (total_width, max_height))
    x_offset = 0
    for img in images:
        combined.paste(img, (x_offset, 0))
        x_offset += img.width
    combined.save(output_file)

class StableHair:
    def __init__(self, config="stable_hair/configs/hair_transfer.yaml", device="cuda", weight_dtype=torch.float16) -> None:
        print("Initializing Stable Hair Pipeline...")
        self.config = OmegaConf.load(config)
        self.device = device

        ### Load controlnet
        unet = UNet2DConditionModel.from_pretrained(self.config.pretrained_model_path, subfolder="unet").to(device)
        controlnet = ControlNetModel.from_unet(unet).to(device)
        _state_dict = torch.load(os.path.join(self.config.pretrained_folder, self.config.controlnet_path))
        controlnet.load_state_dict(_state_dict, strict=False)
        controlnet.to(weight_dtype)

        ### >>> create pipeline >>> ###
        self.pipeline = StableHairPipeline.from_pretrained(
            self.config.pretrained_model_path,
            controlnet=controlnet,
            safety_checker=None,
            torch_dtype=weight_dtype,
        ).to(device)
        self.pipeline.scheduler = UniPCMultistepScheduler.from_config(self.pipeline.scheduler.config)

        ### load Hair encoder/adapter
        self.hair_encoder = ref_unet.from_pretrained(self.config.pretrained_model_path, subfolder="unet").to(device)
        _state_dict = torch.load(os.path.join(self.config.pretrained_folder, self.config.encoder_path))
        self.hair_encoder.load_state_dict(_state_dict, strict=False)
        self.hair_adapter = adapter_injection(self.pipeline.unet, device=self.device, dtype=torch.float16, use_resampler=False)
        _state_dict = torch.load(os.path.join(self.config.pretrained_folder, self.config.adapter_path))
        self.hair_adapter.load_state_dict(_state_dict, strict=False)

        ### load bald converter
        bald_converter = ControlNetModel.from_unet(unet).to(device)
        _state_dict = torch.load(self.config.bald_converter_path)
        bald_converter.load_state_dict(_state_dict, strict=False)
        bald_converter.to(dtype=weight_dtype)
        del unet

        ### create pipeline for hair removal
        self.remove_hair_pipeline = StableDiffusionControlNetPipeline.from_pretrained(
            self.config.pretrained_model_path,
            controlnet=bald_converter,
            safety_checker=None,
            torch_dtype=weight_dtype,
        )
        self.remove_hair_pipeline.scheduler = UniPCMultistepScheduler.from_config(
            self.remove_hair_pipeline.scheduler.config)
        self.remove_hair_pipeline = self.remove_hair_pipeline.to(device)

        ### move to fp16
        self.hair_encoder.to(weight_dtype)
        self.hair_adapter.to(weight_dtype)

        print("Initialization Done!")

    def Hair_Transfer(self, source_image, reference_image, random_seed, step, guidance_scale, scale, controlnet_conditioning_scale, size=512):
        prompt = ""
        n_prompt = ""
        random_seed = int(random_seed)
        step = int(step)
        guidance_scale = float(guidance_scale)
        scale = float(scale)

        # load imgs
        source_image = Image.open(source_image).convert("RGB").resize((size, size))
        id = np.array(source_image)
        reference_image = np.array(Image.open(reference_image).convert("RGB").resize((size, size)))
        source_image_bald = np.array(self.get_bald(source_image, scale=0.9))
        H, W, C = source_image_bald.shape

        # generate images
        set_scale(self.pipeline.unet, scale)
        generator = torch.Generator(device="cuda")
        generator.manual_seed(random_seed)
        sample = self.pipeline(
            prompt,
            negative_prompt=n_prompt,
            num_inference_steps=step,
            guidance_scale=guidance_scale,
            width=W,
            height=H,
            controlnet_condition=source_image_bald,
            controlnet_conditioning_scale=controlnet_conditioning_scale,
            generator=generator,
            reference_encoder=self.hair_encoder,
            ref_image=reference_image,
        ).samples
        return id, sample, source_image_bald, reference_image

    def get_bald(self, id_image, scale):
        W, H = id_image.size  # PIL's Image.size is (width, height)
        scale = float(scale)
        image = self.remove_hair_pipeline(
            prompt="",
            negative_prompt="",
            num_inference_steps=30,
            guidance_scale=1.5,
            width=W,
            height=H,
            image=id_image,
            controlnet_conditioning_scale=scale,
            generator=None,
        ).images[0]

        return image


if __name__ == '__main__':
    model = StableHair(config="./configs/hair_transfer.yaml", weight_dtype=torch.float32)
    kwargs = OmegaConf.to_container(model.config.inference_kwargs)
    id, image, source_image_bald, reference_image = model.Hair_Transfer(**kwargs)
    os.makedirs(model.config.output_path, exist_ok=True)
    output_file = os.path.join(model.config.output_path, model.config.save_name)
    concatenate_images([id, source_image_bald, reference_image, (image * 255.).astype(np.uint8)], output_file=output_file, type="np")
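
The `__main__` block above expects `inference_kwargs`, `output_path`, and `save_name` entries in the config. A sketch of what that section could look like (keys mirror the signature of `Hair_Transfer`; paths are placeholders):

# Sketch only; add to configs/hair_transfer.yaml and adjust paths to your data.
inference_kwargs:
  source_image: examples/source.jpg
  reference_image: examples/reference.jpg
  random_seed: 42
  step: 30
  guidance_scale: 1.5
  scale: 1.0
  controlnet_conditioning_scale: 1.0
output_path: ./output
save_name: result.jpg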
requirements.txt ADDED
@@ -0,0 +1,147 @@
1
+ absl-py==2.1.0
2
+ accelerate==1.0.1
3
+ addict==2.4.0
4
+ aiofiles==23.2.1
5
+ aiohttp==3.9.5
6
+ aiosignal==1.3.1
7
+ albumentations==1.4.3
8
+ altair==5.3.0
9
+ annotated-types==0.6.0
10
+ antlr4-python3-runtime==4.9.3
11
+ anyio==4.3.0
12
+ async-timeout==4.0.3
13
+ attrs==23.2.0
14
+ beautifulsoup4==4.12.3
15
+ bitsandbytes==0.44.1
16
+ bypy==1.8.5
17
+ certifi==2024.2.2
18
+ charset-normalizer==3.3.2
19
+ click==8.1.7
20
+ contourpy==1.2.1
21
+ cycler==0.12.1
22
+ datasets==2.19.0
23
+ diffusers==0.31.0
24
+ dill==0.3.8
25
+ distro==1.9.0
26
+ dlib==19.24.4
27
+ einops==0.4.1
28
+ exceptiongroup==1.2.1
29
+ executing==2.0.1
30
+ fastapi==0.110.2
31
+ ffmpy==0.3.2
32
+ filelock==3.9.0
33
+ fonttools==4.51.0
34
+ frozenlist==1.4.1
35
+ fsspec==2024.3.1
36
+ gdown==5.2.0
37
+ gradio==3.43.1
38
+ gradio_client==0.5.0
39
+ grpcio==1.64.1
40
+ h11==0.14.0
41
+ h5py==3.11.0
42
+ httpcore==1.0.5
43
+ httpx==0.27.0
44
+ huggingface-hub==0.25.2
45
+ idna==3.7
46
+ imageio==2.34.1
47
+ importlib_metadata==7.1.0
48
+ importlib_resources==6.4.0
49
+ Jinja2==3.1.2
50
+ joblib==1.4.0
51
+ jsonschema==4.21.1
52
+ jsonschema-specifications==2023.12.1
53
+ keras==3.3.3
54
+ kiwisolver==1.4.5
55
+ kornia==0.7.3
56
+ kornia_rs==0.1.5
57
+ lazy_loader==0.4
58
+ logger==1.4
59
+ Markdown==3.6
60
+ markdown-it-py==3.0.0
61
+ MarkupSafe==2.1.3
62
+ matplotlib==3.8.4
63
+ mdurl==0.1.2
64
+ ml-dtypes==0.4.0
65
+ mpmath==1.3.0
66
+ mtcnn==0.1.1
67
+ multidict==6.0.5
68
+ multiprocess==0.70.16
69
+ namex==0.0.8
70
+ networkx==3.2.1
71
+ numpy==1.26.4
72
+ nvidia-cublas-cu11==11.11.3.6
73
+ nvidia-cuda-cupti-cu11==11.8.87
74
+ nvidia-cuda-nvrtc-cu11==11.8.89
75
+ nvidia-cuda-runtime-cu11==11.8.89
76
+ nvidia-cudnn-cu11==8.7.0.84
77
+ nvidia-cufft-cu11==10.9.0.58
78
+ nvidia-curand-cu11==10.3.0.86
79
+ nvidia-cusolver-cu11==11.4.1.48
80
+ nvidia-cusparse-cu11==11.7.5.86
81
+ nvidia-nccl-cu11==2.19.3
82
+ nvidia-nvtx-cu11==11.8.86
83
+ omegaconf==2.3.0
84
+ openai==1.31.0
85
+ opencv-python==4.9.0.80
86
+ opencv-python-headless==4.9.0.80
87
+ optree==0.11.0
88
+ orjson==3.10.1
89
+ packaging==24.0
90
+ pandas==2.2.2
91
+ peft==0.11.1
92
+ pillow==10.3.0
93
+ prompt_toolkit==3.0.47
94
+ protobuf==5.27.0
95
+ pyarrow==16.0.0
96
+ pyarrow-hotfix==0.6
97
+ pydantic==2.7.1
98
+ pydantic_core==2.18.2
99
+ pydub==0.25.1
100
+ Pygments==2.17.2
101
+ pyparsing==3.1.2
102
+ PySocks==1.7.1
103
+ python-multipart==0.0.9
104
+ pytz==2024.1
105
+ PyYAML==6.0.1
106
+ referencing==0.35.0
107
+ regex==2024.4.16
108
+ requests==2.31.0
109
+ requests-toolbelt==1.0.0
110
+ rich==13.7.1
111
+ rpds-py==0.18.0
112
+ ruff==0.4.2
113
+ safetensors==0.4.3
114
+ scikit-image==0.22.0
115
+ scikit-learn==1.4.2
116
+ scipy==1.13.0
117
+ semantic-version==2.10.0
118
+ sentencepiece==0.2.0
119
+ shellingham==1.5.4
120
+ sniffio==1.3.1
121
+ soupsieve==2.5
122
+ starlette==0.37.2
123
+ sympy==1.12
124
+ tensorboard==2.16.2
125
+ tensorboard-data-server==0.7.2
126
+ threadpoolctl==3.4.0
127
+ tifffile==2024.4.24
128
+ tokenizers==0.20.1
129
+ tomlkit==0.12.0
130
+ toolz==0.12.1
131
+ torch==2.2.2+cu118
132
+ torchvision==0.16.2+cu118
133
+ tqdm==4.66.2
134
+ transformers==4.45.2
135
+ triton==2.2.0
136
+ typer==0.12.3
137
+ typing_extensions==4.11.0
138
+ tzdata==2024.1
139
+ urllib3==2.2.1
140
+ uvicorn==0.29.0
141
+ websocket-client==1.8.0
142
+ websockets==11.0.3
143
+ Werkzeug==3.0.3
144
+ xformers==0.0.25.post1+cu118
145
+ xxhash==3.4.1
146
+ yarl==1.9.4
147
+ zipp==3.18.1
train_stage1.py ADDED
@@ -0,0 +1,715 @@
1
+ import argparse
2
+ import logging
3
+ import math
4
+ import os
5
+ from pathlib import Path
6
+ import itertools
7
+ import numpy as np
8
+ import torch.utils.checkpoint
9
+ import transformers
10
+ from accelerate import Accelerator
11
+ from accelerate.logging import get_logger
12
+ from accelerate.utils import ProjectConfiguration, set_seed
13
+ from datasets import load_dataset
14
+ from PIL import Image
15
+ from torchvision import transforms
16
+ from tqdm.auto import tqdm
17
+ from transformers import AutoTokenizer, PretrainedConfig
18
+
19
+ import diffusers
20
+ from diffusers import (
21
+ AutoencoderKL,
22
+ UNet2DConditionModel,
23
+ DDPMScheduler,
24
+ UniPCMultistepScheduler,
25
+ )
26
+ from diffusers.optimization import get_scheduler
27
+ from diffusers.utils import check_min_version, is_wandb_available
28
+ import torch.nn.functional as F
29
+ import albumentations as A
30
+ import cv2
31
+ from ref_encoder.latent_controlnet import ControlNetModel
32
+ from utils.pipeline_cn import StableDiffusionControlNetPipeline
33
+
34
+ # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
35
+ check_min_version("0.23.0")
36
+
37
+ logger = get_logger(__name__)
38
+
39
+ def concatenate_images(image_files, output_file, type="pil"):
40
+ if type == "np":
41
+ image_files = [Image.fromarray(img) for img in image_files]
42
+ images = image_files # list
43
+ max_height = max(img.height for img in images)
44
+ images = [img.resize((img.width, max_height)) for img in images]
45
+ total_width = sum(img.width for img in images)
46
+ combined = Image.new('RGB', (total_width, max_height))
47
+ x_offset = 0
48
+ for img in images:
49
+ combined.paste(img, (x_offset, 0))
50
+ x_offset += img.width
51
+ combined.save(output_file)
52
+
53
+ def image_grid(imgs, rows, cols):
54
+ assert len(imgs) == rows * cols
55
+ w, h = imgs[0].size
56
+ grid = Image.new("RGB", size=(cols * w, rows * h))
57
+ for i, img in enumerate(imgs):
58
+ grid.paste(img, box=(i % cols * w, i // cols * h))
59
+ return grid
60
+
61
+ def log_validation(vae, text_encoder, tokenizer, unet, controlnet, args, accelerator, weight_dtype, step):
62
+ logger.info("Running validation... ")
63
+ controlnet = accelerator.unwrap_model(controlnet)
64
+ pipeline = StableDiffusionControlNetPipeline.from_pretrained(
65
+ args.pretrained_model_name_or_path,
66
+ vae=vae,
67
+ text_encoder=text_encoder,
68
+ tokenizer=tokenizer,
69
+ unet=unet,
70
+ controlnet=controlnet,
71
+ safety_checker=None,
72
+ revision=args.revision,
73
+ torch_dtype=weight_dtype,
74
+ )
75
+ pipeline.scheduler = UniPCMultistepScheduler.from_config(pipeline.scheduler.config)
76
+ pipeline = pipeline.to(accelerator.device)
77
+ pipeline.set_progress_bar_config(disable=True)
78
+
79
+ validation_ids = args.validation_ids
80
+ validation_path = os.path.join(args.output_dir, "validation", f"step-{step}")
81
+ os.makedirs(validation_path, exist_ok=True)
82
+ _num = 0
83
+ for validation_id in validation_ids:
84
+ _num += 1
85
+ validation_id = Image.open(validation_id).convert("RGB").resize((512, 512))
86
+ for num in range(args.num_validation_images):
87
+ with torch.autocast("cuda"):
88
+ sample = pipeline(
89
+ prompt="",
90
+ negative_prompt="",
91
+ num_inference_steps=30,
92
+ guidance_scale=1.000001,
93
+ width=512,
94
+ height=512,
95
+ image=validation_id,
96
+ controlnet_conditioning_scale=1.,
97
+ generator=None,
98
+ ).images[0]
99
+ concatenate_images([validation_id, sample],
100
+ output_file=os.path.join(validation_path, str(num)+str(_num)+".jpg"), type="pil")
101
+
102
+
103
+ def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str):
104
+ text_encoder_config = PretrainedConfig.from_pretrained(
105
+ pretrained_model_name_or_path,
106
+ subfolder="text_encoder",
107
+ revision=revision,
108
+ )
109
+ model_class = text_encoder_config.architectures[0]
110
+
111
+ if model_class == "CLIPTextModel":
112
+ from transformers import CLIPTextModel
113
+
114
+ return CLIPTextModel
115
+ elif model_class == "RobertaSeriesModelWithTransformation":
116
+ from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation
117
+
118
+ return RobertaSeriesModelWithTransformation
119
+ else:
120
+ raise ValueError(f"{model_class} is not supported.")
121
+
122
+
123
+ def parse_args(input_args=None):
124
+ parser = argparse.ArgumentParser(description="Simple example of training script.")
125
+ parser.add_argument("--noise_offset", type=float, default=0.1, help="The scale of noise offset.")
126
+ parser.add_argument(
127
+ "--pretrained_model_name_or_path",
128
+ type=str,
129
+ default="/share/zhangyuxuan/project/workspace/sd_model_v1-5",
130
+ help="Path to pretrained model or model identifier from huggingface.co/models."
131
+ )
132
+ parser.add_argument(
133
+ "--controlnet_model_name_or_path",
134
+ type=str,
135
+ default=None,
136
+ help="Path to pretrained controlnet model or model identifier from huggingface.co/models."
137
+ " If not specified controlnet weights are initialized from unet.",
138
+ )
139
+ parser.add_argument(
140
+ "--train_data_dir",
141
+ type=str,
142
+ default="",
143
+ help=(
144
+ "A folder containing the training data. Folder contents must follow the structure described in"
145
+ " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
146
+ " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
147
+ ),
148
+ )
149
+ parser.add_argument("--source_column", type=str, default="image")
150
+ parser.add_argument("--target_column", type=str, default="image")
151
+ parser.add_argument(
152
+ "--revision",
153
+ type=str,
154
+ default=None,
155
+ required=False,
156
+ help=(
157
+ "Revision of pretrained model identifier from huggingface.co/models. Trainable model components should be"
158
+ " float32 precision."
159
+ ),
160
+ )
161
+ parser.add_argument(
162
+ "--output_dir",
163
+ type=str,
164
+ default="train_lr1e-5_refunet",
165
+ help="The output directory where the model predictions and checkpoints will be written.",
166
+ )
167
+ parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
168
+ parser.add_argument(
169
+ "--resolution",
170
+ type=int,
171
+ default=512,
172
+ help=(
173
+ "The resolution for input images, all the images in the train/validation dataset will be resized to this"
174
+ " resolution"
175
+ ),
176
+ )
177
+ parser.add_argument(
178
+ "--train_batch_size", type=int, default=1, help="Batch size (per device) for the training dataloader."
179
+ )
180
+ parser.add_argument("--num_train_epochs", type=int, default=1000)
181
+ parser.add_argument(
182
+ "--max_train_steps",
183
+ type=int,
184
+ default=None,
185
+ help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
186
+ )
187
+ parser.add_argument(
188
+ "--checkpointing_steps",
189
+ type=int,
190
+ default=1000,
191
+ help=(
192
+ "Save a checkpoint of the training state every X updates. Checkpoints can be used for resuming training via `--resume_from_checkpoint`. "
193
+ "In the case that the checkpoint is better than the final trained model, the checkpoint can also be used for inference."
194
+ "Using a checkpoint for inference requires separate loading of the original pipeline and the individual checkpointed model components."
195
+ "See https://huggingface.co/docs/diffusers/main/en/training/dreambooth#performing-inference-using-a-saved-checkpoint for step by step"
196
+ "instructions."
197
+ ),
198
+ )
199
+ parser.add_argument(
200
+ "--resume_from_checkpoint",
201
+ type=str,
202
+ default=None,
203
+ help=(
204
+ "Whether training should be resumed from a previous checkpoint. Use a path saved by"
205
+ ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
206
+ ),
207
+ )
208
+ parser.add_argument(
209
+ "--gradient_accumulation_steps",
210
+ type=int,
211
+ default=1,
212
+ help="Number of updates steps to accumulate before performing a backward/update pass.",
213
+ )
214
+ parser.add_argument(
215
+ "--gradient_checkpointing",
216
+ action="store_true",
217
+ help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
218
+ )
219
+ parser.add_argument(
220
+ "--learning_rate",
221
+ type=float,
222
+ default=1e-5,
223
+ help="Initial learning rate (after the potential warmup period) to use.",
224
+ )
225
+ parser.add_argument(
226
+ "--scale_lr",
227
+ action="store_true",
228
+ default=False,
229
+ help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
230
+ )
231
+ parser.add_argument(
232
+ "--lr_scheduler",
233
+ type=str,
234
+ default="constant",
235
+ help=(
236
+ 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
237
+ ' "constant", "constant_with_warmup"]'
238
+ ),
239
+ )
240
+ parser.add_argument(
241
+ "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
242
+ )
243
+ parser.add_argument(
244
+ "--lr_num_cycles",
245
+ type=int,
246
+ default=1,
247
+ help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
248
+ )
249
+ parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.")
250
+ parser.add_argument(
251
+ "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
252
+ )
253
+ parser.add_argument(
254
+ "--dataloader_num_workers",
255
+ type=int,
256
+ default=8,
257
+ help=(
258
+ "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
259
+ ),
260
+ )
261
+ parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
262
+ parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
263
+ parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
264
+ parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
265
+ parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
266
+ parser.add_argument(
267
+ "--logging_dir",
268
+ type=str,
269
+ default="logs",
270
+ help=(
271
+ "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
272
+ " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
273
+ ),
274
+ )
275
+ parser.add_argument(
276
+ "--report_to",
277
+ type=str,
278
+ default="tensorboard",
279
+ help=(
280
+ 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
281
+ ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
282
+ ),
283
+ )
284
+ parser.add_argument(
285
+ "--mixed_precision",
286
+ type=str,
287
+ default="no",
288
+ choices=["no", "fp16", "bf16"],
289
+ help=(
290
+ "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
291
+ " 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the"
292
+ " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
293
+ ),
294
+ )
295
+ parser.add_argument(
296
+ "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
297
+ )
298
+
299
+ parser.add_argument(
300
+ "--max_train_samples",
301
+ type=int,
302
+ default=None,
303
+ help=(
304
+ "For debugging purposes or quicker training, truncate the number of training examples to this "
305
+ "value if set."
306
+ ),
307
+ )
308
+ parser.add_argument(
309
+ "--proportion_empty_prompts",
310
+ type=float,
311
+ default=0,
312
+ help="Proportion of image prompts to be replaced with empty strings. Defaults to 0 (no prompt replacement).",
313
+ )
314
+ parser.add_argument(
315
+ "--validation_ids",
316
+ type=str,
317
+ default=["", ""],
318
+ nargs="+",
319
+ help=(
320
+ "A set of prompts evaluated every `--validation_steps` and logged to `--report_to`."
321
+ " Provide either a matching number of `--validation_image`s, a single `--validation_image`"
322
+ " to be used with all prompts, or a single prompt that will be used with all `--validation_image`s."
323
+ ),
324
+ )
325
+ parser.add_argument(
326
+ "--validation_hairs",
327
+ type=str,
328
+ default=["", ""],
329
+ nargs="+",
330
+ help=(
331
+ "A set of paths to the controlnet conditioning image be evaluated every `--validation_steps`"
332
+ " and logged to `--report_to`. Provide either a matching number of `--validation_prompt`s, a"
333
+ " a single `--validation_prompt` to be used with all `--validation_image`s, or a single"
334
+ " `--validation_image` that will be used with all `--validation_prompt`s."
335
+ ),
336
+ )
337
+ parser.add_argument(
338
+ "--num_validation_images",
339
+ type=int,
340
+ default=3,
341
+ help="Number of images to be generated for each `--validation_image`, `--validation_prompt` pair",
342
+ )
343
+ parser.add_argument(
344
+ "--validation_steps",
345
+ type=int,
346
+ default=10,
347
+ help=(
348
+ "Run validation every X steps. Validation consists of running the prompt"
349
+ " `args.validation_prompt` multiple times: `args.num_validation_images`"
350
+ " and logging the images."
351
+ ),
352
+ )
353
+ parser.add_argument(
354
+ "--tracker_project_name",
355
+ type=str,
356
+ default="train",
357
+ help=(
358
+ "The `project_name` argument passed to Accelerator.init_trackers for"
359
+ " more information see https://huggingface.co/docs/accelerate/v0.17.0/en/package_reference/accelerator#accelerate.Accelerator"
360
+ ),
361
+ )
362
+
363
+ if input_args is not None:
364
+ args = parser.parse_args(input_args)
365
+ else:
366
+ args = parser.parse_args()
367
+
368
+ if args.resolution % 8 != 0:
369
+ raise ValueError(
370
+ "`--resolution` must be divisible by 8 for consistently sized encoded images between the VAE and the controlnet encoder."
371
+ )
372
+
373
+ return args
374
+
375
+
376
+ def make_train_dataset(args, tokenizer, accelerator):
377
+
378
+ if args.train_data_dir is not None:
379
+ dataset = load_dataset('json', data_files=args.train_data_dir)
380
+ column_names = dataset["train"].column_names
381
+
382
+ # 6. Get the column names for input/target.
383
+ if args.source_column is None:
384
+ source_column = column_names[1]
385
+ logger.info(f"source column defaulting to {source_column}")
386
+ else:
387
+ source_column = args.source_column
388
+ if source_column not in column_names:
389
+ raise ValueError(
390
+ f"`--source_column` value '{args.source_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
391
+ )
392
+
393
+ if args.target_column is None:
394
+ target_column = column_names[1]
395
+ logger.info(f"target column defaulting to {target_column}")
396
+ else:
397
+ target_column = args.target_column
398
+ if target_column not in column_names:
399
+ raise ValueError(
400
+ f"`--target_column` value '{args.target_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
401
+ )
402
+
403
+ norm = transforms.Normalize([0.5], [0.5])
404
+ to_tensor = transforms.ToTensor()
405
+
406
+ pixel_transform = A.Compose([
407
+ A.SmallestMaxSize(max_size=512),
408
+ A.CenterCrop(512, 512),
409
+ A.Affine(scale=(0.5, 1), translate_percent={"x": (-0.1, 0.1), "y": (-0.1, 0.1)}, rotate=(-10, 10), p=0.8),
410
+ ], additional_targets={'image0': 'image', 'image1': 'image'})
411
+
412
+ def imgaug(source_image, target_image):
413
+ source_image = cv2.resize(cv2.cvtColor(source_image, cv2.COLOR_BGR2RGB), [512, 512])
414
+ target_image = cv2.resize(cv2.cvtColor(target_image, cv2.COLOR_BGR2RGB), [512, 512])
415
+ results = pixel_transform(image=source_image, image0=target_image)
416
+ source_image, target_image = norm(to_tensor(results["image"]/255.)), norm(to_tensor(results["image0"]/255.))
417
+ return source_image, target_image
418
+
419
+ def preprocess_train(examples):
420
+ source_images = [cv2.imread(image) for image in examples[source_column]]
421
+ target_images = [cv2.imread(image) for image in examples[target_column]]
422
+
423
+ pair = [imgaug(image1, image2) for image1, image2 in zip(source_images, target_images)]
424
+ source_images, target_images = zip(*pair)
425
+ source_images_ls = list(source_images)
426
+ target_images_ls = list(target_images)
427
+
428
+ examples["source_pixel_values"] = source_images_ls
429
+ examples["target_pixel_values"] = target_images_ls
430
+ return examples
431
+
432
+ with accelerator.main_process_first():
433
+ train_dataset = dataset["train"].with_transform(preprocess_train)
434
+
435
+ return train_dataset
436
+
437
+
438
+ def collate_fn(examples):
439
+ source_pixel_values = torch.stack([example["source_pixel_values"] for example in examples])
440
+ source_pixel_values = source_pixel_values.to(memory_format=torch.contiguous_format).float()
441
+ target_pixel_values = torch.stack([example["target_pixel_values"] for example in examples])
442
+ target_pixel_values = target_pixel_values.to(memory_format=torch.contiguous_format).float()
443
+
444
+ return {
445
+ "source_pixel_values": source_pixel_values,
446
+ "target_pixel_values": target_pixel_values,
447
+ }
448
+
449
+ def main(args):
450
+ logging_dir = Path(args.output_dir, args.logging_dir)
451
+
452
+ accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
453
+
454
+ accelerator = Accelerator(
455
+ gradient_accumulation_steps=args.gradient_accumulation_steps,
456
+ mixed_precision=args.mixed_precision,
457
+ log_with=args.report_to,
458
+ project_config=accelerator_project_config,
459
+ )
460
+
461
+ # Make one log on every process with the configuration for debugging.
462
+ logging.basicConfig(
463
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
464
+ datefmt="%m/%d/%Y %H:%M:%S",
465
+ level=logging.INFO,
466
+ )
467
+ logger.info(accelerator.state, main_process_only=False)
468
+ if accelerator.is_local_main_process:
469
+ transformers.utils.logging.set_verbosity_warning()
470
+ diffusers.utils.logging.set_verbosity_info()
471
+ else:
472
+ transformers.utils.logging.set_verbosity_error()
473
+ diffusers.utils.logging.set_verbosity_error()
474
+
475
+ # If passed along, set the training seed now.
476
+ if args.seed is not None:
477
+ set_seed(args.seed)
478
+
479
+ # Handle the repository creation
480
+ if accelerator.is_main_process:
481
+ if args.output_dir is not None:
482
+ os.makedirs(args.output_dir, exist_ok=True)
483
+
484
+ # Load the tokenizer
485
+ if args.pretrained_model_name_or_path:
486
+ tokenizer = AutoTokenizer.from_pretrained(
487
+ args.pretrained_model_name_or_path,
488
+ subfolder="tokenizer",
489
+ revision=args.revision,
490
+ use_fast=False,
491
+ )
492
+
493
+ # import correct text encoder class
494
+ text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision)
495
+
496
+ # Load scheduler and models
497
+ noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
498
+ text_encoder = text_encoder_cls.from_pretrained(
499
+ args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
500
+ ).to(accelerator.device)
501
+ vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision).to(accelerator.device)
502
+ unet = UNet2DConditionModel.from_pretrained(
503
+ args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
504
+ ).to(accelerator.device)
505
+ if args.controlnet_model_name_or_path:
506
+ logger.info("Loading existing controlnet weights")
507
+ controlnet = ControlNetModel.from_pretrained(args.controlnet_model_name_or_path).to(accelerator.device)
508
+ else:
509
+ logger.info("Initializing controlnet weights from unet")
510
+ controlnet = ControlNetModel.from_unet(unet).to(accelerator.device)
511
+
512
+ vae.requires_grad_(False)
513
+ text_encoder.requires_grad_(False)
514
+ unet.requires_grad_(False)
515
+ controlnet.requires_grad_(True)
516
+
517
+ optimizer_class = torch.optim.AdamW
518
+ # Optimizer creation
519
+ params_to_optimize = controlnet.parameters()
520
+ optimizer = optimizer_class(
521
+ params_to_optimize,
522
+ lr=args.learning_rate,
523
+ betas=(args.adam_beta1, args.adam_beta2),
524
+ weight_decay=args.adam_weight_decay,
525
+ eps=args.adam_epsilon,
526
+ )
527
+
528
+ train_dataset = make_train_dataset(args, tokenizer, accelerator)
529
+ train_dataloader = torch.utils.data.DataLoader(
530
+ train_dataset,
531
+ shuffle=True,
532
+ collate_fn=collate_fn,
533
+ batch_size=args.train_batch_size,
534
+ num_workers=args.dataloader_num_workers,
535
+ )
536
+
537
+ # Scheduler and math around the number of training steps.
538
+ overrode_max_train_steps = False
539
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
540
+ if args.max_train_steps is None:
541
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
542
+ overrode_max_train_steps = True
543
+
544
+ lr_scheduler = get_scheduler(
545
+ args.lr_scheduler,
546
+ optimizer=optimizer,
547
+ num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
548
+ num_training_steps=args.max_train_steps * accelerator.num_processes,
549
+ num_cycles=args.lr_num_cycles,
550
+ power=args.lr_power,
551
+ )
552
+
553
+ # Prepare everything with our `accelerator`.
554
+ controlnet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
555
+ controlnet, optimizer, train_dataloader, lr_scheduler
556
+ )
557
+
558
+ # For mixed-precision training we cast the text_encoder and vae weights to half precision,
559
+ # since these models are only used for inference and do not need full-precision weights.
560
+ weight_dtype = torch.float32
561
+ if accelerator.mixed_precision == "fp16":
562
+ weight_dtype = torch.float16
563
+ elif accelerator.mixed_precision == "bf16":
564
+ weight_dtype = torch.bfloat16
565
+
566
+ # Move vae, unet and text_encoder to device and cast to weight_dtype
567
+ vae.to(accelerator.device, dtype=weight_dtype)
568
+ unet.to(accelerator.device, dtype=weight_dtype)
569
+ text_encoder.to(accelerator.device, dtype=weight_dtype)
570
+ controlnet.to(accelerator.device, dtype=torch.float32)
571
+
572
+ # We need to recalculate our total training steps as the size of the training dataloader may have changed.
573
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
574
+ if overrode_max_train_steps:
575
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
576
+ # Afterwards we recalculate our number of training epochs
577
+ args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
578
+
579
+ # We need to initialize the trackers we use, and also store our configuration.
580
+ # The trackers initialize automatically on the main process.
581
+ if accelerator.is_main_process:
582
+ tracker_config = dict(vars(args))
583
+ # tensorboard cannot handle list types for config
584
+ tracker_config.pop("validation_hairs")
585
+ tracker_config.pop("validation_ids")
586
+ accelerator.init_trackers(args.tracker_project_name, config=tracker_config)
587
+
588
+ # Train!
589
+ total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
590
+
591
+ logger.info("***** Running training *****")
592
+ logger.info(f" Num examples = {len(train_dataset)}")
593
+ logger.info(f" Num batches each epoch = {len(train_dataloader)}")
594
+ logger.info(f" Num Epochs = {args.num_train_epochs}")
595
+ logger.info(f" Instantaneous batch size per device = {args.train_batch_size}")
596
+ logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
597
+ logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
598
+ logger.info(f" Total optimization steps = {args.max_train_steps}")
599
+ global_step = 0
600
+ first_epoch = 0
601
+ initial_global_step = 0
602
+
603
+ progress_bar = tqdm(
604
+ range(0, args.max_train_steps),
605
+ initial=initial_global_step,
606
+ desc="Steps",
607
+ # Only show the progress bar once on each machine.
608
+ disable=not accelerator.is_local_main_process,
609
+ )
610
+
611
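+ # Training is prompt-free: the empty-prompt embedding is computed once here and
+ # reused at every step, so all conditioning comes from the ControlNet input below.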
+ null_text_inputs = tokenizer(
612
+ "", max_length=tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt"
613
+ ).input_ids
614
+ encoder_hidden_states = text_encoder(null_text_inputs.to(device=accelerator.device))[0]
615
+
616
+ for epoch in range(first_epoch, args.num_train_epochs):
617
+ for step, batch in enumerate(train_dataloader):
618
+ with accelerator.accumulate(controlnet):
619
+
620
+ # Convert images to latent space
621
+ latents = vae.encode(batch["target_pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
622
+ latents = latents * vae.config.scaling_factor
623
+
624
+ # Sample noise that we'll add to the latents
625
+ noise = torch.randn_like(latents)
626
+ if args.noise_offset:
627
+ # https://www.crosslabs.org//blog/diffusion-with-offset-noise
628
+ noise += args.noise_offset * torch.randn(
629
+ (latents.shape[0], latents.shape[1], 1, 1), device=latents.device
630
+ )
631
+ bsz = latents.shape[0]
632
+ # Sample a random timestep for each image
633
+ timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
634
+ timesteps = timesteps.long()
635
+
636
+ # Add noise to the latents according to the noise magnitude at each timestep
637
+ # (this is the forward diffusion process)
638
+ noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
639
+ # ref_noisy_latents = noise_scheduler.add_noise(ref_latents, noise, timesteps)
640
+
641
+ content_latents = vae.encode(batch["source_pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
642
+ content_latents = content_latents * vae.config.scaling_factor
643
+ down_block_res_samples, mid_block_res_sample = controlnet(
644
+ noisy_latents,
645
+ timesteps,
647
+ encoder_hidden_states=encoder_hidden_states.repeat(bsz, 1, 1),
648
+ controlnet_cond=content_latents,
649
+ return_dict=False,
650
+ )
651
+
652
+ # Predict the noise residual
653
+ model_pred = unet(
654
+ noisy_latents,
655
+ timesteps,
656
+ encoder_hidden_states=encoder_hidden_states.repeat(bsz, 1, 1).to(dtype=weight_dtype),
657
+ down_block_additional_residuals=[
658
+ sample.to(dtype=weight_dtype) for sample in down_block_res_samples
659
+ ],
660
+ mid_block_additional_residual=mid_block_res_sample.to(dtype=weight_dtype),
661
+ ).sample
662
+
663
+ # Get the target for loss depending on the prediction type
664
+ if noise_scheduler.config.prediction_type == "epsilon":
665
+ target = noise
666
+ elif noise_scheduler.config.prediction_type == "v_prediction":
667
+ target = noise_scheduler.get_velocity(latents, noise, timesteps)
668
+ else:
669
+ raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
670
+ loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
671
+
672
+ accelerator.backward(loss)
673
+ optimizer.step()
674
+ lr_scheduler.step()
675
+ optimizer.zero_grad()
676
+
677
+ # Checks if the accelerator has performed an optimization step behind the scenes
678
+ if accelerator.sync_gradients:
679
+ progress_bar.update(1)
680
+ global_step += 1
681
+
682
+ if accelerator.is_main_process:
683
+ if global_step % args.checkpointing_steps == 0:
684
+ save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
685
+ accelerator.save_state(save_path, safe_serialization=False)
686
+ logger.info(f"Saved state to {save_path}")
687
+
688
+ if args.validation_ids is not None and global_step % args.validation_steps == 0:
689
+ log_validation(
690
+ vae,
691
+ text_encoder,
692
+ tokenizer,
693
+ unet,
694
+ controlnet,
695
+ args,
696
+ accelerator,
697
+ weight_dtype,
698
+ global_step,
699
+ )
700
+
701
+ logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
702
+ progress_bar.set_postfix(**logs)
703
+ accelerator.log(logs, step=global_step)
704
+
705
+ if global_step >= args.max_train_steps:
706
+ break
707
+
708
+ # Training finished: wait for all processes, then end training.
709
+ accelerator.wait_for_everyone()
710
+ accelerator.end_training()
711
+
712
+
713
+ if __name__ == "__main__":
714
+ args = parse_args()
715
+ main(args)
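A note on the `--noise_offset` term used in the training loop above: it adds one shared Gaussian offset per (sample, channel) on top of the usual noise, which helps the model learn global brightness/color shifts. A minimal, self-contained sketch of those few lines (plain PyTorch; the latent shape assumes SD-1.5 at 512px, and the demo values are illustrative only):

```python
import torch

def offset_noise(latents: torch.Tensor, noise_offset: float = 0.1) -> torch.Tensor:
    """Gaussian noise plus a per-(sample, channel) constant offset.

    Mirrors the training loop above: the extra term is broadcast over the
    spatial dimensions, so each (sample, channel) pair shares one shift.
    """
    noise = torch.randn_like(latents)
    if noise_offset:
        noise = noise + noise_offset * torch.randn(
            (latents.shape[0], latents.shape[1], 1, 1), device=latents.device
        )
    return noise

if __name__ == "__main__":
    latents = torch.zeros(2, 4, 64, 64)  # SD-1.5 latents for 512x512 images
    noise = offset_noise(latents)
    # Without the offset these per-channel means would be ~0; with it they drift.
    print(noise.mean(dim=(2, 3)))
```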
train_stage1.sh ADDED
@@ -0,0 +1,21 @@
1
+ export MODEL_DIR="runwayml/stable-diffusion-v1-5" # your SD path
2
+ export OUTPUT_DIR="stage1" # your save path
3
+ export CONFIG="./default_config.yaml"
+ export LOG_PATH="logs" # your log path
+ export TRAIN_DATA="data.jsonl" # your data.jsonl path
+
+ # NOTE: set --validation_ids below to your own validation image paths;
+ # an inline comment after a trailing "\" would break the line continuation.
4
+
5
+ CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch --config_file $CONFIG train_stage1.py \
6
+ --pretrained_model_name_or_path $MODEL_DIR \
7
+ --source_column="target" \
8
+ --target_column="source" \
9
+ --output_dir=$OUTPUT_DIR \
10
+ --logging_dir=$LOG_PATH \
11
+ --mixed_precision="no" \
12
+ --train_data_dir $TRAIN_DATA \
13
+ --resolution=512 \
14
+ --learning_rate=5e-5 \
15
+ --train_batch_size=16 \
16
+ --num_validation_images=2 \
17
+ --validation_ids "1.png" "2.png" \
18
+ --gradient_accumulation_steps=1 \
19
+ --num_train_epochs=500 \
20
+ --validation_steps=2000 \
21
+ --checkpointing_steps=2000
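The file passed via `--train_data_dir` is read with `load_dataset('json', data_files=...)`, so it should be a JSON-Lines file whose keys match the `--*_column` flags (stage 1 uses `source`/`target`; stage 2 additionally needs `reference`). A minimal sketch of producing such a file; the paths below are hypothetical placeholders:

```python
import json

# One record per line; keys must match the --refer/source/target_column flags.
# The "reference" key is only required for stage 2.
records = [
    {"source": "/data/source/0001.jpg", "target": "/data/target/0001.jpg", "reference": "/data/reference/0001.jpg"},
    {"source": "/data/source/0002.jpg", "target": "/data/target/0002.jpg", "reference": "/data/reference/0002.jpg"},
]

with open("data.jsonl", "w") as f:
    for record in records:
        f.write(json.dumps(record) + "\n")

# The training scripts then read it as:
#   from datasets import load_dataset
#   dataset = load_dataset("json", data_files="data.jsonl")
#   print(dataset["train"].column_names)  # ['source', 'target', 'reference']
```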
train_stage2.py ADDED
@@ -0,0 +1,816 @@
1
+ import argparse
2
+ import logging
3
+ import math
4
+ import os
5
+ from pathlib import Path
6
+ import itertools
7
+ import numpy as np
8
+ import torch.utils.checkpoint
9
+ import transformers
10
+ from accelerate import Accelerator
11
+ from accelerate.logging import get_logger
12
+ from accelerate.utils import ProjectConfiguration, set_seed
13
+ from datasets import load_dataset
14
+ from PIL import Image
15
+ from torchvision import transforms
16
+ from tqdm.auto import tqdm
17
+ from transformers import AutoTokenizer, PretrainedConfig
18
+
19
+ import diffusers
20
+ from diffusers import (
21
+ AutoencoderKL,
22
+ UNet2DConditionModel,
23
+ DDPMScheduler,
24
+ UniPCMultistepScheduler,
25
+ )
26
+ from diffusers.optimization import get_scheduler
27
+ from diffusers.utils import check_min_version, is_wandb_available
28
+
29
+ from utils.pipeline import StableHairPipeline
30
+ from ref_encoder.adapter import *
31
+ from ref_encoder.reference_control import ReferenceAttentionControl
32
+ from ref_encoder.reference_unet import ref_unet
33
+ from ref_encoder.latent_controlnet import ControlNetModel
34
+ import albumentations as A
35
+ import cv2
36
+ import torch.nn.functional as F
37
+
38
+ # Will error if the minimal version of diffusers is not installed. Remove at your own risk.
39
+ check_min_version("0.23.0")
40
+
41
+ logger = get_logger(__name__)
42
+
43
+ def concatenate_images(image_files, output_file, type="pil"):
44
+ if type == "np":
45
+ image_files = [Image.fromarray(img) for img in image_files]
46
+ images = image_files # list
47
+ max_height = max(img.height for img in images)
48
+ images = [img.resize((img.width, max_height)) for img in images]
49
+ total_width = sum(img.width for img in images)
50
+ combined = Image.new('RGB', (total_width, max_height))
51
+ x_offset = 0
52
+ for img in images:
53
+ combined.paste(img, (x_offset, 0))
54
+ x_offset += img.width
55
+ combined.save(output_file)
56
+
57
+ def image_grid(imgs, rows, cols):
58
+ assert len(imgs) == rows * cols
59
+ w, h = imgs[0].size
60
+ grid = Image.new("RGB", size=(cols * w, rows * h))
61
+ for i, img in enumerate(imgs):
62
+ grid.paste(img, box=(i % cols * w, i // cols * h))
63
+ return grid
64
+
65
+ def log_validation(vae, text_encoder, tokenizer, unet, controlnet, hair_encoder, args, accelerator, weight_dtype, step):
66
+ logger.info("Running validation... ")
67
+ controlnet = accelerator.unwrap_model(controlnet)
68
+ hair_encoder = accelerator.unwrap_model(hair_encoder)
69
+ pipeline = StableHairPipeline.from_pretrained(
70
+ args.pretrained_model_name_or_path,
71
+ vae=vae,
72
+ text_encoder=text_encoder,
73
+ tokenizer=tokenizer,
74
+ unet=unet,
75
+ controlnet=controlnet,
76
+ safety_checker=None,
77
+ revision=args.revision,
78
+ torch_dtype=weight_dtype,
79
+ )
80
+ pipeline.scheduler = UniPCMultistepScheduler.from_config(pipeline.scheduler.config)
81
+ pipeline = pipeline.to(accelerator.device)
82
+ pipeline.set_progress_bar_config(disable=True)
83
+
84
+ validation_ids = args.validation_ids
85
+ validation_hairs = args.validation_hairs
86
+ validation_path = os.path.join(args.output_dir, "validation", f"step-{step}")
87
+ os.makedirs(validation_path, exist_ok=True)
88
+ _num = 0
89
+ for validation_id, validation_hair in zip(validation_ids, validation_hairs):
90
+ _num += 1
91
+ validation_id = np.array(Image.open(validation_id).convert("RGB").resize((512, 512)))
92
+ validation_hair = np.array(Image.open(validation_hair).convert("RGB").resize((512, 512)))
93
+ for num in range(args.num_validation_images):
94
+ with torch.autocast("cuda"):
95
+ sample = pipeline(
96
+ prompt="",
97
+ negative_prompt="",
98
+ num_inference_steps=30,
99
+ guidance_scale=2,
100
+ width=512,
101
+ height=512,
102
+ controlnet_condition=validation_id,
103
+ controlnet_conditioning_scale=1.,
104
+ generator=None,
105
+ reference_encoder=hair_encoder,
106
+ ref_image=validation_hair,
107
+ ).samples
108
+ concatenate_images([validation_id, validation_hair, (sample * 255.).astype(np.uint8)],
109
+ output_file=os.path.join(validation_path, str(num)+str(_num)+".jpg"), type="np")
110
+
111
+
112
+ def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str):
113
+ text_encoder_config = PretrainedConfig.from_pretrained(
114
+ pretrained_model_name_or_path,
115
+ subfolder="text_encoder",
116
+ revision=revision,
117
+ )
118
+ model_class = text_encoder_config.architectures[0]
119
+
120
+ if model_class == "CLIPTextModel":
121
+ from transformers import CLIPTextModel
122
+
123
+ return CLIPTextModel
124
+ elif model_class == "RobertaSeriesModelWithTransformation":
125
+ from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation
126
+
127
+ return RobertaSeriesModelWithTransformation
128
+ else:
129
+ raise ValueError(f"{model_class} is not supported.")
130
+
131
+
132
+ def parse_args(input_args=None):
133
+ parser = argparse.ArgumentParser(description="Simple example of training script.")
134
+ parser.add_argument("--noise_offset", type=float, default=0.1, help="The scale of noise offset.")
135
+ parser.add_argument(
136
+ "--pretrained_model_name_or_path",
137
+ type=str,
138
+ default="",
139
+ help="Path to pretrained model or model identifier from huggingface.co/models."
140
+ )
141
+ parser.add_argument(
142
+ "--controlnet_model_name_or_path",
143
+ type=str,
144
+ default=None,
145
+ help="Path to pretrained controlnet model or model identifier from huggingface.co/models."
146
+ " If not specified controlnet weights are initialized from unet.",
147
+ )
148
+ parser.add_argument(
149
+ "--train_data_dir",
150
+ type=str,
151
+ default="",
152
+ help=(
153
+ "A folder containing the training data. Folder contents must follow the structure described in"
154
+ " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
155
+ " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
156
+ ),
157
+ )
158
+ parser.add_argument("--refer_column", type=str, default="reference")
159
+ parser.add_argument("--source_column", type=str, default="source")
160
+ parser.add_argument("--target_column", type=str, default="target")
161
+ parser.add_argument(
162
+ "--revision",
163
+ type=str,
164
+ default=None,
165
+ required=False,
166
+ help=(
167
+ "Revision of pretrained model identifier from huggingface.co/models. Trainable model components should be"
168
+ " float32 precision."
169
+ ),
170
+ )
171
+ parser.add_argument(
172
+ "--output_dir",
173
+ type=str,
174
+ default="train_lr1e-5_refunet",
175
+ help="The output directory where the model predictions and checkpoints will be written.",
176
+ )
177
+ parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
178
+ parser.add_argument(
179
+ "--resolution",
180
+ type=int,
181
+ default=512,
182
+ help=(
183
+ "The resolution for input images, all the images in the train/validation dataset will be resized to this"
184
+ " resolution"
185
+ ),
186
+ )
187
+ parser.add_argument(
188
+ "--train_batch_size", type=int, default=1, help="Batch size (per device) for the training dataloader."
189
+ )
190
+ parser.add_argument("--num_train_epochs", type=int, default=1000)
191
+ parser.add_argument(
192
+ "--max_train_steps",
193
+ type=int,
194
+ default=None,
195
+ help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
196
+ )
197
+ parser.add_argument(
198
+ "--checkpointing_steps",
199
+ type=int,
200
+ default=1000,
201
+ help=(
202
+ "Save a checkpoint of the training state every X updates. Checkpoints can be used for resuming training via `--resume_from_checkpoint`. "
203
+ "In the case that the checkpoint is better than the final trained model, the checkpoint can also be used for inference. "
204
+ "Using a checkpoint for inference requires separate loading of the original pipeline and the individual checkpointed model components. "
205
+ "See https://huggingface.co/docs/diffusers/main/en/training/dreambooth#performing-inference-using-a-saved-checkpoint for step-by-step"
206
+ " instructions."
207
+ ),
208
+ )
209
+ parser.add_argument(
210
+ "--resume_from_checkpoint",
211
+ type=str,
212
+ default=None,
213
+ help=(
214
+ "Whether training should be resumed from a previous checkpoint. Use a path saved by"
215
+ ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
216
+ ),
217
+ )
218
+ parser.add_argument(
219
+ "--gradient_accumulation_steps",
220
+ type=int,
221
+ default=1,
222
+ help="Number of update steps to accumulate before performing a backward/update pass.",
223
+ )
224
+ parser.add_argument(
225
+ "--gradient_checkpointing",
226
+ action="store_true",
227
+ help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
228
+ )
229
+ parser.add_argument(
230
+ "--learning_rate",
231
+ type=float,
232
+ default=1e-5,
233
+ help="Initial learning rate (after the potential warmup period) to use.",
234
+ )
235
+ parser.add_argument(
236
+ "--scale_lr",
237
+ action="store_true",
238
+ default=False,
239
+ help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
240
+ )
241
+ parser.add_argument(
242
+ "--lr_scheduler",
243
+ type=str,
244
+ default="constant",
245
+ help=(
246
+ 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
247
+ ' "constant", "constant_with_warmup"]'
248
+ ),
249
+ )
250
+ parser.add_argument(
251
+ "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
252
+ )
253
+ parser.add_argument(
254
+ "--lr_num_cycles",
255
+ type=int,
256
+ default=1,
257
+ help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
258
+ )
259
+ parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.")
260
+ parser.add_argument(
261
+ "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
262
+ )
263
+ parser.add_argument(
264
+ "--dataloader_num_workers",
265
+ type=int,
266
+ default=0,
267
+ help=(
268
+ "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
269
+ ),
270
+ )
271
+ parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
272
+ parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
273
+ parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
274
+ parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
275
+ parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
276
+ parser.add_argument(
277
+ "--logging_dir",
278
+ type=str,
279
+ default="logs",
280
+ help=(
281
+ "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
282
+ " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
283
+ ),
284
+ )
285
+ parser.add_argument(
286
+ "--report_to",
287
+ type=str,
288
+ default="tensorboard",
289
+ help=(
290
+ 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
291
+ ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
292
+ ),
293
+ )
294
+ parser.add_argument(
295
+ "--mixed_precision",
296
+ type=str,
297
+ default="fp16",
298
+ choices=["no", "fp16", "bf16"],
299
+ help=(
300
+ "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
301
+ " 1.10 and an Nvidia Ampere GPU. Defaults to the value of the accelerate config of the current system or the"
302
+ " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
303
+ ),
304
+ )
305
+ parser.add_argument(
306
+ "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
307
+ )
308
+
309
+ parser.add_argument(
310
+ "--max_train_samples",
311
+ type=int,
312
+ default=None,
313
+ help=(
314
+ "For debugging purposes or quicker training, truncate the number of training examples to this "
315
+ "value if set."
316
+ ),
317
+ )
318
+ parser.add_argument(
319
+ "--proportion_empty_prompts",
320
+ type=float,
321
+ default=0,
322
+ help="Proportion of image prompts to be replaced with empty strings. Defaults to 0 (no prompt replacement).",
323
+ )
324
+ parser.add_argument(
325
+ "--validation_ids",
326
+ type=str,
327
+ default=["/share2/zhangyuxuan/project/stable_hair/test_imgs/ID/girl.jpg", "/share2/zhangyuxuan/project/stable_hair/test_imgs/ID/man.jpg"],
328
+ nargs="+",
329
+ help=(
330
+ "A set of source (identity) image paths evaluated every `--validation_steps` and logged to"
331
+ " `--report_to`. Provide the same number of reference hair images via `--validation_hairs`;"
332
+ " each identity image is paired with the corresponding reference image during validation."
333
+ ),
334
+ )
335
+ parser.add_argument(
336
+ "--validation_hairs",
337
+ type=str,
338
+ default=["", ""],
339
+ nargs="+",
340
+ help=(
341
+ "A set of reference hair image paths evaluated every `--validation_steps` and logged to"
342
+ " `--report_to`. Provide the same number of paths as `--validation_ids`; each reference"
343
+ " hair image is paired with the corresponding identity image"
344
+ " during validation."
345
+ ),
346
+ )
347
+ parser.add_argument(
348
+ "--num_validation_images",
349
+ type=int,
350
+ default=3,
351
+ help="Number of images to be generated for each `--validation_image`, `--validation_prompt` pair",
352
+ )
353
+ parser.add_argument(
354
+ "--validation_steps",
355
+ type=int,
356
+ default=1000,
357
+ help=(
358
+ "Run validation every X steps. Validation generates"
359
+ " `args.num_validation_images` images for each validation pair"
360
+ " and logs the results."
361
+ ),
362
+ )
363
+ parser.add_argument(
364
+ "--tracker_project_name",
365
+ type=str,
366
+ default="train",
367
+ help=(
368
+ "The `project_name` argument passed to Accelerator.init_trackers. For"
369
+ " more information see https://huggingface.co/docs/accelerate/v0.17.0/en/package_reference/accelerator#accelerate.Accelerator"
370
+ ),
371
+ )
372
+
373
+ if input_args is not None:
374
+ args = parser.parse_args(input_args)
375
+ else:
376
+ args = parser.parse_args()
377
+
378
+ if args.resolution % 8 != 0:
379
+ raise ValueError(
380
+ "`--resolution` must be divisible by 8 for consistently sized encoded images between the VAE and the controlnet encoder."
381
+ )
382
+
383
+ return args
384
+
385
+
386
+ def make_train_dataset(args, tokenizer, accelerator):
387
+
388
+ if args.train_data_dir is not None:
389
+ dataset = load_dataset('json', data_files=args.train_data_dir)
390
+ column_names = dataset["train"].column_names
391
+
392
+ # 6. Get the column names for input/target.
393
+ if args.refer_column is None:
394
+ refer_column = column_names[0]
395
+ logger.info(f"image column defaulting to {refer_column}")
396
+ else:
397
+ refer_column = args.refer_column
398
+ if refer_column not in column_names:
399
+ raise ValueError(
400
+ f"`--refer_column` value '{args.refer_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
401
+ )
402
+ if args.source_column is None:
403
+ source_column = column_names[1]
404
+ logger.info(f"source column defaulting to {source_column}")
405
+ else:
406
+ source_column = args.source_column
407
+ if source_column not in column_names:
408
+ raise ValueError(
409
+ f"`--source_column` value '{args.source_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
410
+ )
411
+
412
+ if args.target_column is None:
413
+ target_column = column_names[2]
414
+ logger.info(f"target column defaulting to {target_column}")
415
+ else:
416
+ target_column = args.target_column
417
+ if target_column not in column_names:
418
+ raise ValueError(
419
+ f"`--target_column` value '{args.target_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
420
+ )
421
+
422
+ norm = transforms.Normalize([0.5], [0.5])
423
+ to_tensor = transforms.ToTensor()
424
+ prob = 0.7
425
+
426
+ pixel_transform = A.Compose([
427
+ A.SmallestMaxSize(max_size=512),
428
+ A.CenterCrop(512, 512),
429
+ A.Affine(scale=(0.5, 1), translate_percent={"x": (-0.1, 0.1), "y": (-0.1, 0.1)}, rotate=(-10, 10), p=0.8),
430
+ A.OneOf(
431
+ [
432
+ A.PixelDropout(dropout_prob=0.1, p=prob),
433
+ A.GaussNoise(var_limit=(10.0, 50.0), mean=0, p=prob),
434
+ A.RandomShadow(shadow_roi=(0.1, 0.1, 0.9, 0.9), p=prob),
435
+ ]
436
+ )
437
+ ], additional_targets={'image0': 'image', 'image1': 'image'})
438
+
439
+ hair_transform = A.Compose([
440
+ A.SmallestMaxSize(max_size=512),
441
+ A.CenterCrop(512, 512),
442
+ A.Affine(scale=(0.9, 1.2), rotate=(-10, 10), p=0.7)]
443
+ )
444
+
445
+ def refer_imgaug(image):
446
+ image = cv2.resize(cv2.cvtColor(image, cv2.COLOR_BGR2RGB), [512, 512])
447
+ results = hair_transform(image=image)
448
+ image = norm(to_tensor(results["image"]/255.))
449
+ return image
450
+
451
+ def imgaug(source_image, target_image):
452
+ source_image = cv2.resize(cv2.cvtColor(source_image, cv2.COLOR_BGR2RGB), [512, 512])
453
+ target_image = cv2.resize(cv2.cvtColor(target_image, cv2.COLOR_BGR2RGB), [512, 512])
454
+ results = pixel_transform(image=source_image, image0=target_image)
455
+ source_image, target_image = norm(to_tensor(results["image"]/255.)), norm(to_tensor(results["image0"]/255.))
456
+ return source_image, target_image
457
+
458
+ def preprocess_train(examples):
459
+ source_images = [cv2.imread(image) for image in examples[source_column]]
460
+ refer_images = [cv2.imread(image) for image in examples[refer_column]]
461
+ target_images = [cv2.imread(image) for image in examples[target_column]]
462
+
463
+ pair = [imgaug(image1, image2) for image1, image2 in zip(source_images, target_images)]
464
+ source_images, target_images = zip(*pair)
465
+ source_images_ls = list(source_images)
466
+ target_images_ls = list(target_images)
467
+
468
+ refer_images_ls = [refer_imgaug(image) for image in refer_images]
469
+
470
+ examples["source_pixel_values"] = source_images_ls
471
+ examples["refer_pixel_values"] = refer_images_ls
472
+ examples["target_pixel_values"] = target_images_ls
473
+ return examples
474
+
475
+ with accelerator.main_process_first():
476
+ train_dataset = dataset["train"].with_transform(preprocess_train)
477
+
478
+ return train_dataset
479
+
480
+
481
+ def collate_fn(examples):
482
+ source_pixel_values = torch.stack([example["source_pixel_values"] for example in examples])
483
+ source_pixel_values = source_pixel_values.to(memory_format=torch.contiguous_format).float()
484
+ refer_pixel_values = torch.stack([example["refer_pixel_values"] for example in examples])
485
+ refer_pixel_values = refer_pixel_values.to(memory_format=torch.contiguous_format).float()
486
+ target_pixel_values = torch.stack([example["target_pixel_values"] for example in examples])
487
+ target_pixel_values = target_pixel_values.to(memory_format=torch.contiguous_format).float()
488
+
489
+ return {
490
+ "source_pixel_values": source_pixel_values,
491
+ "refer_pixel_values": refer_pixel_values,
492
+ "target_pixel_values": target_pixel_values,
493
+ }
494
+
495
+ def main(args):
496
+ logging_dir = Path(args.output_dir, args.logging_dir)
497
+
498
+ accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
499
+
500
+ accelerator = Accelerator(
501
+ gradient_accumulation_steps=args.gradient_accumulation_steps,
502
+ mixed_precision=args.mixed_precision,
503
+ log_with=args.report_to,
504
+ project_config=accelerator_project_config,
505
+ )
506
+
507
+ # Make one log on every process with the configuration for debugging.
508
+ logging.basicConfig(
509
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
510
+ datefmt="%m/%d/%Y %H:%M:%S",
511
+ level=logging.INFO,
512
+ )
513
+ logger.info(accelerator.state, main_process_only=False)
514
+ if accelerator.is_local_main_process:
515
+ transformers.utils.logging.set_verbosity_warning()
516
+ diffusers.utils.logging.set_verbosity_info()
517
+ else:
518
+ transformers.utils.logging.set_verbosity_error()
519
+ diffusers.utils.logging.set_verbosity_error()
520
+
521
+ # If passed along, set the training seed now.
522
+ if args.seed is not None:
523
+ set_seed(args.seed)
524
+
525
+ # Handle the repository creation
526
+ if accelerator.is_main_process:
527
+ if args.output_dir is not None:
528
+ os.makedirs(args.output_dir, exist_ok=True)
529
+
530
+ # Load the tokenizer
531
+ if args.pretrained_model_name_or_path:
532
+ tokenizer = AutoTokenizer.from_pretrained(
533
+ args.pretrained_model_name_or_path,
534
+ subfolder="tokenizer",
535
+ revision=args.revision,
536
+ use_fast=False,
537
+ )
538
+
539
+ # import correct text encoder class
540
+ text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision)
541
+
542
+ # Load scheduler and models
543
+ noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
544
+ text_encoder = text_encoder_cls.from_pretrained(
545
+ args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
546
+ ).to(accelerator.device)
547
+ vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision).to(accelerator.device)
548
+ unet = UNet2DConditionModel.from_pretrained(
549
+ args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
550
+ ).to(accelerator.device)
551
+ if args.controlnet_model_name_or_path:
552
+ logger.info("Loading existing controlnet weights")
553
+ controlnet = ControlNetModel.from_pretrained(args.controlnet_model_name_or_path).to(accelerator.device)
554
+ else:
555
+ logger.info("Initializing controlnet weights from unet")
556
+ controlnet = ControlNetModel.from_unet(unet).to(accelerator.device)
557
+
558
+ ### load Hair encoder/adapter/reference_control_modules
559
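+ # When resuming, the files written by accelerator.save_state map onto the trainable
+ # modules in the order they are passed to accelerator.prepare below:
+ # pytorch_model.bin -> Hair_Encoder, pytorch_model_1.bin -> Hair_Adapter,
+ # pytorch_model_2.bin -> controlnet.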
+ resume = False
560
+ if resume:
561
+ Hair_Encoder = ref_unet.from_pretrained(
562
+ args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
563
+ ).to(accelerator.device)
564
+ pretrained_folder = "" # your checkpoint path
565
+ _state_dict = torch.load(os.path.join(pretrained_folder, "pytorch_model.bin"))
566
+ Hair_Encoder.load_state_dict(_state_dict, strict=False)
567
+ torch.cuda.empty_cache()
568
+ _state_dict = torch.load(os.path.join(pretrained_folder, "pytorch_model_1.bin"))
569
+ Hair_Adapter = adapter_injection(unet, dtype=torch.float32, use_resampler=False)
570
+ Hair_Adapter.load_state_dict(_state_dict, strict=False)
571
+ torch.cuda.empty_cache()
572
+ _state_dict = torch.load(os.path.join(pretrained_folder, "pytorch_model_2.bin"))
573
+ controlnet.load_state_dict(_state_dict, strict=False)
574
+ torch.cuda.empty_cache()
575
+ else:
576
+ Hair_Encoder = ref_unet.from_pretrained(
577
+ args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
578
+ ).to(accelerator.device)
579
+ Hair_Adapter = adapter_injection(unet, dtype=torch.float32).to(accelerator.device)
580
+
581
+
582
+ vae.requires_grad_(False)
583
+ text_encoder.requires_grad_(False)
584
+ unet.requires_grad_(False)
585
+ Hair_Encoder.requires_grad_(True)
586
+ Hair_Adapter.requires_grad_(True)
587
+ controlnet.requires_grad_(True)
588
+
589
+ optimizer_class = torch.optim.AdamW
590
+ # Optimizer creation
591
+ params_to_optimize = itertools.chain(controlnet.parameters(),
592
+ Hair_Encoder.parameters(),
593
+ Hair_Adapter.parameters())
594
+ optimizer = optimizer_class(
595
+ params_to_optimize,
596
+ lr=args.learning_rate,
597
+ betas=(args.adam_beta1, args.adam_beta2),
598
+ weight_decay=args.adam_weight_decay,
599
+ eps=args.adam_epsilon,
600
+ )
601
+
602
+ train_dataset = make_train_dataset(args, tokenizer, accelerator)
603
+ train_dataloader = torch.utils.data.DataLoader(
604
+ train_dataset,
605
+ shuffle=True,
606
+ collate_fn=collate_fn,
607
+ batch_size=args.train_batch_size,
608
+ num_workers=args.dataloader_num_workers,
609
+ )
610
+
611
+ # Scheduler and math around the number of training steps.
612
+ overrode_max_train_steps = False
613
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
614
+ if args.max_train_steps is None:
615
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
616
+ overrode_max_train_steps = True
617
+
618
+ lr_scheduler = get_scheduler(
619
+ args.lr_scheduler,
620
+ optimizer=optimizer,
621
+ num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
622
+ num_training_steps=args.max_train_steps * accelerator.num_processes,
623
+ num_cycles=args.lr_num_cycles,
624
+ power=args.lr_power,
625
+ )
626
+
627
+ # Prepare everything with our `accelerator`.
628
+ Hair_Encoder, Hair_Adapter, controlnet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
629
+ Hair_Encoder, Hair_Adapter, controlnet, optimizer, train_dataloader, lr_scheduler
630
+ )
631
+
632
+ # For mixed-precision training we cast the text_encoder and vae weights to half precision,
633
+ # since these models are only used for inference and do not need full-precision weights.
634
+ weight_dtype = torch.float32
635
+ if accelerator.mixed_precision == "fp16":
636
+ weight_dtype = torch.float16
637
+ elif accelerator.mixed_precision == "bf16":
638
+ weight_dtype = torch.bfloat16
639
+
640
+ # Move vae, unet and text_encoder to device and cast to weight_dtype
641
+ vae.to(accelerator.device, dtype=weight_dtype)
642
+ unet.to(accelerator.device, dtype=weight_dtype)
643
+ text_encoder.to(accelerator.device, dtype=weight_dtype)
644
+ Hair_Encoder.to(accelerator.device, dtype=torch.float32)
645
+ Hair_Adapter.to(accelerator.device, dtype=torch.float32)
646
+ controlnet.to(accelerator.device, dtype=torch.float32)
647
+
648
+ # We need to recalculate our total training steps as the size of the training dataloader may have changed.
649
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
650
+ if overrode_max_train_steps:
651
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
652
+ # Afterwards we recalculate our number of training epochs
653
+ args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
654
+
655
+ # We need to initialize the trackers we use, and also store our configuration.
656
+ # The trackers initialize automatically on the main process.
657
+ if accelerator.is_main_process:
658
+ tracker_config = dict(vars(args))
659
+ # tensorboard cannot handle list types for config
660
+ tracker_config.pop("validation_hairs")
661
+ tracker_config.pop("validation_ids")
662
+ accelerator.init_trackers(args.tracker_project_name, config=tracker_config)
663
+
664
+ # Train!
665
+ total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
666
+
667
+ logger.info("***** Running training *****")
668
+ logger.info(f" Num examples = {len(train_dataset)}")
669
+ logger.info(f" Num batches each epoch = {len(train_dataloader)}")
670
+ logger.info(f" Num Epochs = {args.num_train_epochs}")
671
+ logger.info(f" Instantaneous batch size per device = {args.train_batch_size}")
672
+ logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
673
+ logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
674
+ logger.info(f" Total optimization steps = {args.max_train_steps}")
675
+ global_step = 0
676
+ first_epoch = 0
677
+ initial_global_step = 0
678
+
679
+ progress_bar = tqdm(
680
+ range(0, args.max_train_steps),
681
+ initial=initial_global_step,
682
+ desc="Steps",
683
+ # Only show the progress bar once on each machine.
684
+ disable=not accelerator.is_local_main_process,
685
+ )
686
+
687
+ null_text_inputs = tokenizer(
688
+ "", max_length=tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt"
689
+ ).input_ids
690
+ encoder_hidden_states = text_encoder(null_text_inputs.to(device=accelerator.device))[0]
691
+
692
+ for epoch in range(first_epoch, args.num_train_epochs):
693
+ for step, batch in enumerate(train_dataloader):
694
+ with accelerator.accumulate(controlnet):
695
+
696
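+ # Reference-attention hookup: the hair encoder runs in 'write' mode to cache its
+ # self-attention features, the denoising UNet runs in 'read' mode to consume them,
+ # and the reader is cleared again after the UNet forward pass below.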
+ reference_control_writer_train = ReferenceAttentionControl(Hair_Encoder,
697
+ do_classifier_free_guidance=False,
698
+ mode='write', fusion_blocks='full')
699
+ reference_control_reader_train = ReferenceAttentionControl(unet, do_classifier_free_guidance=False,
700
+ mode='read',
701
+ fusion_blocks='full')
702
+ # Convert images to latent space
703
+ latents = vae.encode(batch["target_pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
704
+ latents = latents * vae.config.scaling_factor
705
+ ref_latents = vae.encode(batch["refer_pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
706
+ ref_latents = ref_latents * vae.config.scaling_factor
707
+
708
+ # Sample noise that we'll add to the latents
709
+ noise = torch.randn_like(latents)
710
+ if args.noise_offset:
711
+ # https://www.crosslabs.org//blog/diffusion-with-offset-noise
712
+ noise += args.noise_offset * torch.randn(
713
+ (latents.shape[0], latents.shape[1], 1, 1), device=latents.device
714
+ )
715
+ bsz = latents.shape[0]
716
+ # Sample a random timestep for each image
717
+ timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
718
+ timesteps = timesteps.long()
719
+
720
+ # Add noise to the latents according to the noise magnitude at each timestep
721
+ # (this is the forward diffusion process)
722
+ noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
723
+ # ref_noisy_latents = noise_scheduler.add_noise(ref_latents, noise, timesteps)
724
+
725
+ controlnet_latents = vae.encode(batch["source_pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
726
+ controlnet_latents = controlnet_latents * vae.config.scaling_factor
727
+
728
+ # for b in range(bsz):
729
+ # max_value = torch.max(controlnet_latents[b])
730
+ # min_value = torch.min(controlnet_latents[b])
731
+ # controlnet_latents[b] = (controlnet_latents[b]-min_value)/(max_value-min_value)
732
+
733
+ down_block_res_samples, mid_block_res_sample = controlnet(
734
+ noisy_latents,
735
+ timesteps,
736
+ encoder_hidden_states=encoder_hidden_states.repeat(bsz, 1, 1),
737
+ controlnet_cond=controlnet_latents,
738
+ return_dict=False,
739
+ )
740
+
741
+ # writer
742
+ Hair_Encoder(
743
+ # ref_noisy_latents,
744
+ ref_latents,
745
+ timesteps,
746
+ encoder_hidden_states=encoder_hidden_states.repeat(bsz, 1, 1))
747
+ reference_control_reader_train.update(reference_control_writer_train)
748
+
749
+ # Predict the noise residual
750
+ model_pred = unet(
751
+ noisy_latents,
752
+ timesteps,
753
+ encoder_hidden_states=encoder_hidden_states.repeat(bsz, 1, 1).to(dtype=weight_dtype),
754
+ down_block_additional_residuals=[
755
+ sample.to(dtype=weight_dtype) for sample in down_block_res_samples
756
+ ],
757
+ mid_block_additional_residual=mid_block_res_sample.to(dtype=weight_dtype),
758
+ ).sample
759
+
760
+ # clean the reader
761
+ reference_control_reader_train.clear()
762
+
763
+ # Get the target for loss depending on the prediction type
764
+ if noise_scheduler.config.prediction_type == "epsilon":
765
+ target = noise
766
+ elif noise_scheduler.config.prediction_type == "v_prediction":
767
+ target = noise_scheduler.get_velocity(latents, noise, timesteps)
768
+ else:
769
+ raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
770
+ loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
771
+
772
+ accelerator.backward(loss)
773
+ optimizer.step()
774
+ lr_scheduler.step()
775
+ optimizer.zero_grad()
776
+
777
+ # Checks if the accelerator has performed an optimization step behind the scenes
778
+ if accelerator.sync_gradients:
779
+ progress_bar.update(1)
780
+ global_step += 1
781
+
782
+ if accelerator.is_main_process:
783
+ if global_step % args.checkpointing_steps == 0:
784
+ save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
785
+ accelerator.save_state(save_path, safe_serialization=False)
786
+ logger.info(f"Saved state to {save_path}")
787
+
788
+ if args.validation_ids is not None and global_step % args.validation_steps == 0:
789
+ log_validation(
790
+ vae,
791
+ text_encoder,
792
+ tokenizer,
793
+ unet,
794
+ controlnet,
795
+ Hair_Encoder,
796
+ args,
797
+ accelerator,
798
+ weight_dtype,
799
+ global_step,
800
+ )
801
+
802
+ logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
803
+ progress_bar.set_postfix(**logs)
804
+ accelerator.log(logs, step=global_step)
805
+
806
+ if global_step >= args.max_train_steps:
807
+ break
808
+
809
+ # Training finished: wait for all processes, then end training.
810
+ accelerator.wait_for_everyone()
811
+ accelerator.end_training()
812
+
813
+
814
+ if __name__ == "__main__":
815
+ args = parse_args()
816
+ main(args)
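For reuse outside `--resume_from_checkpoint`, the per-module files written by `accelerator.save_state(..., safe_serialization=False)` can be loaded back the same way the resume branch above does. A minimal sketch under those assumptions (the checkpoint folder name is illustrative; the file-to-module mapping mirrors the resume code in this script):

```python
import os
import torch

# Hypothetical checkpoint folder produced by accelerator.save_state(...)
ckpt_dir = "stage2/checkpoint-5000"

# Same mapping as the resume branch above:
#   pytorch_model.bin   -> hair encoder (reference UNet)
#   pytorch_model_1.bin -> hair adapter modules
#   pytorch_model_2.bin -> latent ControlNet
encoder_sd = torch.load(os.path.join(ckpt_dir, "pytorch_model.bin"), map_location="cpu")
adapter_sd = torch.load(os.path.join(ckpt_dir, "pytorch_model_1.bin"), map_location="cpu")
controlnet_sd = torch.load(os.path.join(ckpt_dir, "pytorch_model_2.bin"), map_location="cpu")

# Then, with the models instantiated the same way as in the script:
#   hair_encoder.load_state_dict(encoder_sd, strict=False)
#   hair_adapter.load_state_dict(adapter_sd, strict=False)
#   controlnet.load_state_dict(controlnet_sd, strict=False)
print(len(encoder_sd), len(adapter_sd), len(controlnet_sd))  # number of tensors in each
```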
train_stage2.sh ADDED
@@ -0,0 +1,23 @@
1
+ export MODEL_DIR="runwayml/stable-diffusion-v1-5" # your SD path
2
+ export OUTPUT_DIR="stage2" # your save path
3
+ export CONFIG="./default_config.yaml"
+ export LOG_PATH="logs" # your log path
4
+
5
+ CUDA_VISIBLE_DEVICES=1,2,3,4 accelerate launch --config_file $CONFIG train_stage2.py \
6
+ --pretrained_model_name_or_path $MODEL_DIR \
7
+ --refer_column="reference" \
8
+ --source_column="source" \
9
+ --target_column="target" \
10
+ --output_dir=$OUTPUT_DIR \
11
+ --logging_dir=$LOG_PATH \
12
+ --mixed_precision="no" \
13
+ --train_data_dir "your_data_jsonl_path.jsonl" \
14
+ --resolution=512 \
15
+ --learning_rate=2e-5 \
16
+ --train_batch_size=6 \
17
+ --num_validation_images=2 \
18
+ --validation_ids "1.jpg" "2.jpg" \
19
+ --validation_hairs "1.jpg" "2.jpg" \
20
+ --gradient_accumulation_steps=1 \
21
+ --num_train_epochs=100 \
22
+ --validation_steps=5000 \
23
+ --checkpointing_steps=5000
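For orientation, the effective batch size and step count implied by the launch above (4 visible GPUs, per-device batch 6, no gradient accumulation) follow the same arithmetic the training script logs at startup; a small sketch with a hypothetical dataset size:

```python
import math

# Values taken from train_stage2.sh; the dataset size is a placeholder.
num_processes = 4
train_batch_size = 6
gradient_accumulation_steps = 1
num_train_epochs = 100
num_examples = 10_000  # replace with your own dataset size

# Same formula the script logs as "Total train batch size".
total_batch_size = train_batch_size * num_processes * gradient_accumulation_steps  # 24

# Approximate optimization steps: the dataloader is sharded across processes,
# so each epoch contributes roughly ceil(num_examples / total_batch_size) updates.
updates_per_epoch = math.ceil(num_examples / total_batch_size)
max_train_steps = num_train_epochs * updates_per_epoch

print(total_batch_size, updates_per_epoch, max_train_steps)  # 24 417 41700
```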