JiantaoLin committed
Commit ebe241c · 1 Parent(s): 235efa3

new

Browse files:
- app.py +1 -1
- pipeline/kiss3d_wrapper.py +37 -40
app.py CHANGED
@@ -421,7 +421,7 @@ with gr.Blocks(css="""
                 # reconstruction_stage2_steps = gr.Number(value=50, label="reconstruction_stage2_steps")
 
                 btn_gen_mesh = gr.Button("Generate Mesh")
-                output_video1 = gr.Video(label="
+                output_video1 = gr.Video(label="Render Video", interactive=False, loop=True, autoplay=True)
                 # btn_download1 = gr.Button("Download Mesh")
 
 
pipeline/kiss3d_wrapper.py CHANGED
@@ -74,15 +74,11 @@ def init_wrapper_from_config(config_path):
         flux_pipe = FluxImg2ImgPipeline.from_single_file(flux_base_model_pth, torch_dtype=dtype_[flux_dtype], token=access_token)
     else:
         flux_pipe = FluxImg2ImgPipeline.from_pretrained(flux_base_model_pth, torch_dtype=dtype_[flux_dtype], token=access_token)
-
-    # flux_pipe.enable_vae_tiling()
-    # flux_pipe.vae = taef1
-    flux_pipe.vae.enable_slicing()  # optimization for multi-batch image generation
+    flux_pipe.vae.enable_slicing()
     flux_pipe.vae.enable_tiling()
 
-    # flux_pipe.enable_sequential_cpu_offload()
     # load flux model and controlnet
-    if flux_controlnet_pth is not None:
+    if flux_controlnet_pth is not None:
         flux_controlnet = FluxControlNetModel.from_pretrained(flux_controlnet_pth, torch_dtype=torch.bfloat16)
         flux_pipe = convert_flux_pipeline(flux_pipe, FluxControlNetImg2ImgPipeline, controlnet=[flux_controlnet])
 
@@ -91,57 +87,55 @@ def init_wrapper_from_config(config_path):
     # load lora weights
     flux_pipe.load_lora_weights(flux_lora_pth)
     # flux_pipe.to(device=flux_device)
-    # flux_pipe.enable_model_cpu_offload(device=flux_device)
-    # flux_pipe = None
 
     # load redux model
     flux_redux_pipe = None
-    if flux_redux_pth is not None:
+    if flux_redux_pth is not None:
         flux_redux_pipe = FluxPriorReduxPipeline.from_pretrained(flux_redux_pth, torch_dtype=torch.bfloat16, token=access_token)
         flux_redux_pipe.text_encoder = flux_pipe.text_encoder
         flux_redux_pipe.text_encoder_2 = flux_pipe.text_encoder_2
         flux_redux_pipe.tokenizer = flux_pipe.tokenizer
         flux_redux_pipe.tokenizer_2 = flux_pipe.tokenizer_2
 
-        flux_redux_pipe.to(device=flux_device)
+        # flux_redux_pipe.to(device=flux_device)
 
     # logger.warning(f"GPU memory allocated after load flux model on {flux_device}: {torch.cuda.memory_allocated(device=flux_device) / 1024**3} GB")
 
     # TODO: load pulid model
 
     # init multiview model
-
-
-
-
-
-
-
-
-
-
+    logger.info('==> Loading multiview diffusion model ...')
+    multiview_device = config_['multiview'].get('device', 'cpu')
+    multiview_pipeline = DiffusionPipeline.from_pretrained(
+        config_['multiview']['base_model'],
+        custom_pipeline=config_['multiview']['custom_pipeline'],
+        torch_dtype=torch.float16,
+    )
+    multiview_pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(
+        multiview_pipeline.scheduler.config, timestep_spacing='trailing'
+    )
 
-    #
-
-
-
-
-
+    # unet_ckpt_path = hf_hub_download(repo_id="LTT/Kiss3DGen", filename="flexgen_19w.ckpt", repo_type="model", token=access_token)
+    unet_ckpt_path = hf_hub_download(repo_id="LTT/Kiss3DGen", filename="flexgen.ckpt", repo_type="model", token=access_token)
+    if unet_ckpt_path is not None:
+        state_dict = torch.load(unet_ckpt_path, map_location='cpu')
+        # state_dict = {k[10:]: v for k, v in state_dict.items() if k.startswith('unet.unet.')}
+        multiview_pipeline.unet.load_state_dict(state_dict, strict=True)
 
     # multiview_pipeline.to(multiview_device)
     # logger.warning(f"GPU memory allocated after load multiview model on {multiview_device}: {torch.cuda.memory_allocated(device=multiview_device) / 1024**3} GB")
-    multiview_pipeline = None
+    # multiview_pipeline = None
 
 
     # load caption model
-
-
-
-
-
+    logger.info('==> Loading caption model ...')
+    caption_device = config_['caption'].get('device', 'cpu')
+    caption_model = AutoModelForCausalLM.from_pretrained(config_['caption']['base_model'], \
+        torch_dtype=torch.bfloat16, trust_remote_code=True)
+    caption_processor = AutoProcessor.from_pretrained(config_['caption']['base_model'], trust_remote_code=True)
     # logger.warning(f"GPU memory allocated after load caption model on {caption_device}: {torch.cuda.memory_allocated(device=caption_device) / 1024**3} GB")
-    caption_processor = None
-    caption_model = None
+    # caption_processor = None
+    # caption_model = None
 
     # load reconstruction model
     logger.info('==> Loading reconstruction model ...')
@@ -156,8 +150,7 @@ def init_wrapper_from_config(config_path):
     recon_model.to(recon_device)
     recon_model.eval()
     # logger.warning(f"GPU memory allocated after load reconstruction model on {recon_device}: {torch.cuda.memory_allocated(device=recon_device) / 1024**3} GB")
-
-    # recon_model_config = None
+
     # load llm
     llm_configs = config_.get('llm', None)
     if llm_configs is not None:
@@ -242,7 +235,7 @@ class kiss3d_wrapper(object):
         """
         torch_dtype = torch.bfloat16
         caption_device = self.config['caption'].get('device', 'cpu')
-
+        self.caption_model.to(caption_device)
         if isinstance(image, str): # If image is a file path
             image = preprocess_input_image(Image.open(image))
         elif not isinstance(image, Image.Image):
@@ -264,7 +257,7 @@ class kiss3d_wrapper(object):
         logger.info(f"Auto caption result: \"{caption_text}\"")
 
         caption_text = self.get_detailed_prompt(caption_text)
-
+        self.caption_model.to('cpu')
         return caption_text
     # @spaces.GPU
     def get_detailed_prompt(self, prompt, seed=None):
@@ -290,7 +283,7 @@ class kiss3d_wrapper(object):
     def generate_multiview(self, image, seed=None, num_inference_steps=None):
         seed = seed or self.config['multiview'].get('seed', 0)
         mv_device = self.config['multiview'].get('device', 'cpu')
-
+        self.multiview_pipeline.to(mv_device)
         generator = torch.Generator(device=mv_device).manual_seed(seed)
         with self.context():
             mv_image = self.multiview_pipeline(image,
@@ -298,6 +291,7 @@ class kiss3d_wrapper(object):
                                                width=512*2,
                                                height=512*2,
                                                generator=generator).images[0]
+        self.multiview_pipeline.to('cpu')
         return mv_image
 
     def reconstruct_from_multiview(self, mv_image, lrm_render_radius=4.15):
@@ -375,6 +369,7 @@ class kiss3d_wrapper(object):
         } # for https://huggingface.co/InstantX/FLUX.1-dev-Controlnet-Union only
 
         flux_device = self.config['flux'].get('device', 'cpu')
+        self.flux_pipeline.to(flux_device)
         seed = seed or self.config['flux'].get('seed', 0)
         num_inference_steps = num_inference_steps or self.config['flux'].get('num_inference_steps', 20)
 
@@ -401,6 +396,7 @@ class kiss3d_wrapper(object):
 
         # do redux
         if redux_hparam is not None:
+            self.flux_redux_pipeline.to(flux_device)
             assert self.flux_redux_pipeline is not None
             assert 'image' in redux_hparam.keys()
             redux_hparam_ = {
@@ -413,6 +409,7 @@ class kiss3d_wrapper(object):
             redux_output = self.flux_redux_pipeline(**redux_hparam_)
 
             hparam_dict.update(redux_output)
+            self.flux_redux_pipeline.to('cpu')
 
         # append controlnet hparams
         if len(control_image) > 0:
@@ -442,7 +439,7 @@ class kiss3d_wrapper(object):
             torchvision.utils.save_image(gen_3d_bundle_image_, save_path)
             logger.info(f"Save generated 3D bundle image to {save_path}")
             return gen_3d_bundle_image_, save_path
-
+        self.flux_pipeline.to('cpu')
         return gen_3d_bundle_image_
 
     def preprocess_controlnet_cond_image(self, image, control_mode, save_intermediate_results=True, **kwargs):
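Note: every change in kiss3d_wrapper.py above applies the same VRAM-saving pattern: each pipeline is loaded on CPU, moved to its configured device just before the stage that uses it, and moved back to 'cpu' afterwards (caption model, multiview pipeline, flux pipeline, redux pipeline). Below is a minimal sketch of that pattern as a reusable context manager; the helper name on_device and the torch.cuda.empty_cache() call are illustrative assumptions, not code from this commit.

from contextlib import contextmanager

import torch

@contextmanager
def on_device(pipe, device):
    """Temporarily move a pipeline/model to `device`, then park it back on CPU."""
    # Illustrative helper (not part of this repo): .to() works for any
    # diffusers pipeline or torch.nn.Module, matching the calls in the diff.
    pipe.to(device)
    try:
        yield pipe
    finally:
        pipe.to('cpu')
        torch.cuda.empty_cache()  # hand the freed VRAM back to the allocator

# Hypothetical usage, mirroring generate_multiview() above:
# with on_device(self.multiview_pipeline, mv_device) as pipe:
#     mv_image = pipe(image, width=512*2, height=512*2, generator=generator).images[0]

A try/finally helper like this also returns the weights to CPU when a stage raises, which the hand-written .to('cpu') calls in this commit do not guarantee.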