ironjr committed
Commit 30b6ccf · verified · 1 Parent(s): 183248b

Update model.py

Files changed (1)
  1. model.py +31 -30
model.py CHANGED
@@ -1,4 +1,4 @@
-# Copyright (c) 2024 Jaerin Lee
+# Copyright (c) 2025 Jaerin Lee
 
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -19,7 +19,7 @@
 # SOFTWARE.
 
 from transformers import Blip2Processor, Blip2ForConditionalGeneration
-from diffusers import DiffusionPipeline, LCMScheduler, DDIMScheduler, AutoencoderTiny
+from diffusers import LCMScheduler, DDIMScheduler, AutoencoderTiny
 
 import torch
 import torch.nn as nn
@@ -31,15 +31,15 @@ from typing import Tuple, List, Literal, Optional, Union
 from tqdm import tqdm
 from PIL import Image
 
-from util import gaussian_lowpass, blend, get_panorama_views, shift_to_mask_bbox_center
+from util import load_model, gaussian_lowpass, blend, get_panorama_views, shift_to_mask_bbox_center
 
 
-class StableMultiDiffusionPipeline(nn.Module):
+class SemanticDrawPipeline(nn.Module):
     def __init__(
         self,
         device: torch.device,
         dtype: torch.dtype = torch.float16,
-        sd_version: Literal['1.5', '2.0', '2.1', 'xl'] = '1.5',
+        sd_version: Literal['1.5'] = '1.5',
         hf_key: Optional[str] = None,
         lora_key: Optional[str] = None,
         load_from_local: bool = False, # Turn on if you have already downloaded LoRA & the Hugging Face hub is down.
@@ -52,8 +52,9 @@ class StableMultiDiffusionPipeline(nn.Module):
         default_preprocess_mask_cover_alpha: float = 0.3,
         t_index_list: List[int] = [0, 4, 12, 25, 37], # [0, 5, 16, 18, 20, 37], # [0, 12, 25, 37], # Magic number.
         mask_type: Literal['discrete', 'semi-continuous', 'continuous'] = 'discrete',
+        has_i2t: bool = True,
     ) -> None:
-        r"""Stabilized MultiDiffusion for fast sampling.
+        r"""Stabilized regionally assigned texts-to-image generation for fast sampling.
 
         Accelerated region-based text-to-image synthesis with Latent Consistency
         Model while preserving mask fidelity and quality.
@@ -95,13 +96,16 @@ class StableMultiDiffusionPipeline(nn.Module):
             default_preprocess_mask_cover_alpha (float): Optional preprocessing
                 where each mask covered by other masks is reduced in its alpha
                 value by this specified factor.
-            t_index_list (List[int]): The default scheduling for LCM scheduler.
+            t_index_list (List[int]): The default scheduling for the scheduler.
            mask_type (Literal['discrete', 'semi-continuous', 'continuous']):
                defines the mask quantization modes. Details in the codes of
                `self.process_mask`. Basically, this (subtly) controls the
                smoothness of foreground-background blending. More continuous
                means more blending, but smaller generated patch depending on
                the mask standard deviation.
+            has_i2t (bool): Automatic background image to text prompt con-
+                version with the BLIP-2 model. May not be necessary for the non-
+                streaming application.
         """
         super().__init__()
 
@@ -120,30 +124,24 @@ class StableMultiDiffusionPipeline(nn.Module):
         self.mask_type = mask_type
 
         print(f'[INFO] Loading Stable Diffusion...')
-        variant = None
         lora_weight_name = None
         if self.sd_version == '1.5':
             if hf_key is not None:
-                print(f'[INFO] Using Hugging Face custom model key: {hf_key}')
+                print(f'[INFO] Using custom model key: {hf_key}')
                 model_key = hf_key
             else:
                 model_key = 'runwayml/stable-diffusion-v1-5'
-                # variant = 'fp16'
             lora_key = 'latent-consistency/lcm-lora-sdv1-5'
             lora_weight_name = 'pytorch_lora_weights.safetensors'
-        # elif self.sd_version == 'xl':
-        #     model_key = 'stabilityai/stable-diffusion-xl-base-1.0'
-        #     lora_key = 'latent-consistency/lcm-lora-sdxl'
-        #     variant = 'fp16'
-        #     lora_weight_name = 'pytorch_lora_weights.safetensors'
         else:
             raise ValueError(f'Stable Diffusion version {self.sd_version} not supported.')
 
         # Create model
-        self.i2t_processor = Blip2Processor.from_pretrained('Salesforce/blip2-opt-2.7b')
-        self.i2t_model = Blip2ForConditionalGeneration.from_pretrained('Salesforce/blip2-opt-2.7b')
+        if has_i2t:
+            self.i2t_processor = Blip2Processor.from_pretrained('Salesforce/blip2-opt-2.7b')
+            self.i2t_model = Blip2ForConditionalGeneration.from_pretrained('Salesforce/blip2-opt-2.7b')
 
-        self.pipe = DiffusionPipeline.from_pretrained(model_key, variant=variant, torch_dtype=dtype).to(self.device)
+        self.pipe = load_model(model_key, self.sd_version, self.device, self.dtype)
         if lora_key is None:
             print(f'[INFO] LCM LoRA is not available for SD version {sd_version}. Using DDIM Scheduler instead...')
             self.pipe.scheduler = DDIMScheduler.from_config(self.pipe.scheduler.config)
@@ -166,7 +164,7 @@ class StableMultiDiffusionPipeline(nn.Module):
         self.vae_scale_factor = self.pipe.vae_scale_factor
 
         # Prepare white background for bootstrapping.
-        # self.get_white_background(768, 768) # This causes problems in the HF ZeroGPU environment.
+        self.get_white_background(768, 768)
 
         print(f'[INFO] Model is loaded!')
 
@@ -281,11 +279,14 @@ class StableMultiDiffusionPipeline(nn.Module):
         Returns:
             A single string of text prompt.
         """
-        question = 'Question: What are in the image? Answer:'
-        inputs = self.i2t_processor(image, question, return_tensors='pt')
-        out = self.i2t_model.generate(**inputs, max_new_tokens=77)
-        prompt = self.i2t_processor.decode(out[0], skip_special_tokens=True).strip()
-        return prompt
+        if hasattr(self, 'i2t_model'):
+            question = 'Question: What are in the image? Answer:'
+            inputs = self.i2t_processor(image, question, return_tensors='pt')
+            out = self.i2t_model.generate(**inputs, max_new_tokens=77)
+            prompt = self.i2t_processor.decode(out[0], skip_special_tokens=True).strip()
+            return prompt
+        else:
+            return ''
 
     @torch.no_grad()
     def encode_imgs(
@@ -405,7 +406,7 @@ class StableMultiDiffusionPipeline(nn.Module):
             25, 37], the masks are split into binary masks whose values are
             greater than these levels. This results in gradual increase of mask
             region as the timesteps increase. Details are described in our
-            paper at https://arxiv.org/pdf/2403.09055.pdf.
+            paper.
 
         On the Three Modes of `mask_type`:
             `self.mask_type` is predefined at the initialization stage of this
@@ -609,7 +610,7 @@ class StableMultiDiffusionPipeline(nn.Module):
 
         Minimal Example:
            >>> device = torch.device('cuda:0')
-            >>> smd = StableMultiDiffusionPipeline(device)
+            >>> smd = SemanticDrawPipeline(device)
            >>> image = smd.sample('A photo of the dolomites')
            >>> image.save('my_creation.png')
 
@@ -675,7 +676,7 @@ class StableMultiDiffusionPipeline(nn.Module):
 
         Minimal Example:
            >>> device = torch.device('cuda:0')
-            >>> smd = StableMultiDiffusionPipeline(device)
+            >>> smd = SemanticDrawPipeline(device)
            >>> image = smd.sample_panorama(
            >>>     'A photo of Alps', height=512, width=3072)
            >>> image.save('my_panorama_creation.png')
@@ -792,7 +793,7 @@ class StableMultiDiffusionPipeline(nn.Module):
 
         Example:
            >>> device = torch.device('cuda:0')
-            >>> smd = StableMultiDiffusionPipeline(device)
+            >>> smd = SemanticDrawPipeline(device)
            >>> prompts = {... specify prompts}
            >>> masks = {... specify mask tensors}
            >>> height, width = masks.shape[-2:]
@@ -881,7 +882,7 @@ class StableMultiDiffusionPipeline(nn.Module):
 
         # prompts is None: return background.
         # masks is None but prompts is not None: return prompts
-        # masks is not None and prompts is not None: Do StableMultiDiffusion.
+        # masks is not None and prompts is not None: Do SemanticDraw.
 
         if prompts is None or (isinstance(prompts, (list, tuple, str)) and len(prompts) == 0):
             if background is None and background_prompt is not None:
@@ -1103,4 +1104,4 @@ class StableMultiDiffusionPipeline(nn.Module):
             image = blend(image, background[0], fg_mask)
         else:
             image = T.ToPILImage()(image)
-        return image
+        return image
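Pipeline construction now goes through util.load_model, whose body is not part of this diff. A minimal sketch of what such a helper might do, assuming it only wraps the DiffusionPipeline.from_pretrained call that the old inline code used; the actual implementation in util.py may differ:

import torch
from diffusers import DiffusionPipeline

def load_model(model_key: str, sd_version: str, device: torch.device, dtype: torch.dtype):
    # Hypothetical stand-in for util.load_model, not shown in this commit.
    # sd_version is accepted to match the call site in __init__ but is unused here.
    # Build the pipeline for the given model key and move it to the requested
    # device and dtype, as the removed DiffusionPipeline.from_pretrained(...).to(device) call did.
    pipe = DiffusionPipeline.from_pretrained(model_key, torch_dtype=dtype)
    return pipe.to(device)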
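Taken together, the renamed class keeps the usage pattern from the docstring examples above. A minimal sketch, assuming the surrounding repository (model.py and its util module) is on the path and a CUDA device is available; has_i2t=False is the new switch in this file and simply skips loading the BLIP-2 captioner:

import torch
from model import SemanticDrawPipeline

device = torch.device('cuda:0')
# Skip the BLIP-2 image-to-text model when automatic background captioning is not needed.
smd = SemanticDrawPipeline(device, has_i2t=False)
image = smd.sample('A photo of the dolomites')
image.save('my_creation.png')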