File size: 6,210 Bytes
18274c1
453ed2e
 
 
1a833ba
453ed2e
a29e3ba
00f6a78
9ad92f4
453ed2e
9ad92f4
4984c7e
 
be85eb8
 
453ed2e
e56af76
b31f6c0
be85eb8
ecc6c05
453ed2e
00f6a78
 
a29e3ba
4984c7e
be85eb8
766763f
be85eb8
 
 
ecc6c05
842563d
be85eb8
 
ecc6c05
766763f
9ad92f4
00f6a78
453ed2e
00f6a78
ee36d88
ecc6c05
4984c7e
453ed2e
7391723
a29e3ba
453ed2e
 
 
 
 
9ad92f4
 
 
842563d
 
 
 
 
9ad92f4
 
842563d
9ad92f4
 
4984c7e
842563d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4984c7e
 
842563d
 
 
 
 
 
4984c7e
86d5e88
 
 
842563d
b31f6c0
 
49ad6a5
842563d
 
 
 
 
 
 
 
 
 
 
 
b31f6c0
453ed2e
4984c7e
842563d
a29e3ba
842563d
1a833ba
7391723
842563d
a29e3ba
453ed2e
 
4984c7e
9ad92f4
 
453ed2e
4984c7e
 
 
 
 
842563d
4984c7e
842563d
 
4984c7e
 
842563d
4984c7e
 
 
 
 
 
 
 
 
c000f9c
842563d
 
 
c000f9c
 
842563d
 
 
 
 
 
 
453ed2e
 
842563d
 
 
453ed2e
842563d
 
c000f9c
842563d
 
 
453ed2e
 
842563d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
import spaces
import torch
import gradio as gr
from PIL import Image
import random
from diffusers import (
    DiffusionPipeline,
    AutoencoderKL,
    StableDiffusionControlNetPipeline,
    ControlNetModel,
    StableDiffusionLatentUpscalePipeline,
    StableDiffusionImg2ImgPipeline,
    StableDiffusionControlNetImg2ImgPipeline,
    DPMSolverMultistepScheduler,
    EulerDiscreteScheduler
)
import tempfile
import time
import os
from transformers import CLIPImageProcessor

BASE_MODEL = "SG161222/Realistic_Vision_V5.1_noVAE"

# Load the components shared by the generation pipeline: a standalone VAE
# (the base checkpoint is the "noVAE" variant) and the ControlNet weights.
vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16)
controlnet = ControlNetModel.from_pretrained("monster-labs/control_v1p_sd15_qrcode_monster", torch_dtype=torch.float16)

# The safety checker is opt-in: it is only loaded when the environment
# variable SAFETY_CHECKER is set to "1". Otherwise both values stay None and
# the pipeline is created without one.
SAFETY_CHECKER_ENABLED = os.environ.get("SAFETY_CHECKER", "0") == "1"
safety_checker = None
feature_extractor = None

if SAFETY_CHECKER_ENABLED:
    # Imported here because the file-level diffusers import list does not
    # include this class; without it, enabling the checker raised NameError.
    from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker

    safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker").to("cuda")
    feature_extractor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Text-to-image ControlNet pipeline used by inference().
main_pipe = StableDiffusionControlNetPipeline.from_pretrained(
    BASE_MODEL,
    controlnet=controlnet,
    vae=vae,
    safety_checker=safety_checker,
    feature_extractor=feature_extractor,
    torch_dtype=torch.float16,
).to("cuda")

# Sampler map: UI label -> factory that builds a scheduler from an existing
# scheduler config. The Karras option must use the `use_karras_sigmas` config
# key; the previous `use_karras` keyword is not a DPMSolverMultistepScheduler
# option, so the Karras sigma schedule was never actually enabled.
SAMPLER_MAP = {
    "DPM++ Karras SDE": lambda config: DPMSolverMultistepScheduler.from_config(
        config, use_karras_sigmas=True, algorithm_type="sde-dpmsolver++"
    ),
    "Euler": lambda config: EulerDiscreteScheduler.from_config(config),
}

def center_crop_resize(img, output_size=(512, 512)):
    """Crop the largest centered square out of ``img`` and resize it.

    Args:
        img: Image object exposing ``size`` as ``(width, height)`` plus
            ``crop(box)`` and ``resize(size)`` (a PIL image in this file).
        output_size: ``(width, height)`` handed to ``img.resize``.

    Returns:
        The cropped image resized to ``output_size``.
    """
    width, height = img.size
    side = min(width, height)

    # Use integer offsets so the crop box is always exactly ``side`` pixels
    # wide and tall. Fractional coordinates (e.g. 1.5 .. 4.5) leave the final
    # pixel box to the imaging library's rounding, which can yield a box that
    # is one pixel off from square.
    left = (width - side) // 2
    top = (height - side) // 2

    img = img.crop((left, top, left + side, top + side))
    return img.resize(output_size)

def common_upscale(samples, width, height, upscale_method, crop=False):
    """Resize a 4-D tensor to ``(height, width)``, optionally center-cropping first.

    Args:
        samples: Tensor whose dims 2 and 3 are read as height and width.
        width: Target width.
        height: Target height.
        upscale_method: Passed as ``mode`` to ``torch.nn.functional.interpolate``.
        crop: When equal to ``"center"``, the input is trimmed symmetrically
            to the target aspect ratio before resizing; any other value
            skips the crop.

    Returns:
        The interpolated tensor.
    """
    if crop == "center":
        src_height, src_width = samples.shape[2], samples.shape[3]
        src_ratio = src_width / src_height
        dst_ratio = width / height

        # Amount trimmed from each side along x / y to match the target ratio.
        trim_x = trim_y = 0
        if src_ratio > dst_ratio:
            trim_x = round((src_width - src_width * (dst_ratio / src_ratio)) / 2)
        elif src_ratio < dst_ratio:
            trim_y = round((src_height - src_height * (src_ratio / dst_ratio)) / 2)

        samples = samples[:, :, trim_y:src_height - trim_y, trim_x:src_width - trim_x]

    return torch.nn.functional.interpolate(samples, size=(height, width), mode=upscale_method)

def upscale(samples, upscale_method, scale_by):
    """Scale ``samples["images"]`` by ``scale_by`` along dims 2 and 3.

    Args:
        samples: Mapping-like object whose ``"images"`` entry is a 4-D tensor.
        upscale_method: Interpolation mode forwarded to ``common_upscale``.
        scale_by: Multiplier applied to both spatial dimensions (rounded).

    Returns:
        The resized tensor; no cropping is requested.
    """
    images = samples["images"]
    target_height = round(images.shape[2] * scale_by)
    target_width = round(images.shape[3] * scale_by)
    return common_upscale(images, target_width, target_height, upscale_method, "disabled")

def check_inputs(prompt: str, control_image: Image.Image):
    """Validate the UI inputs before running inference.

    Raises:
        gr.Error: If no control image was provided, or if the prompt is
            ``None`` or empty. The image is checked first.
    """
    image_missing = control_image is None
    if image_missing:
        raise gr.Error("Please select or upload an Input Illusion")

    prompt_missing = prompt is None or prompt == ""
    if prompt_missing:
        raise gr.Error("Prompt is required")

_image_pipe = None


def _get_image_pipe():
    """Return the img2img ControlNet pipeline used for the refinement pass.

    Built on first use from ``main_pipe.components`` so it is constructed from
    the modules ``main_pipe`` already holds rather than loading them again.
    """
    global _image_pipe
    if _image_pipe is None:
        _image_pipe = StableDiffusionControlNetImg2ImgPipeline(**main_pipe.components)
    return _image_pipe


@spaces.GPU
def inference(control_image: Image.Image, prompt: str, negative_prompt: str,
              guidance_scale: float = 8.0,
              controlnet_conditioning_scale: float = 1,
              control_guidance_start: float = 0,
              control_guidance_end: float = 1,
              upscaler_strength: float = 0.5,
              seed: int = -1,
              sampler="DPM++ Karras SDE",
              progress=gr.Progress(track_tqdm=True),
              profile=None):
    """Generate an illusion image in two passes and return it.

    Pass 1 runs the text-to-image ControlNet pipeline on a 512x512 crop of the
    control image and keeps the result as latents. The latents are upscaled
    2x and refined by the img2img ControlNet pipeline against a 1024x1024
    crop of the same control image.

    Args:
        control_image: Pattern image that conditions the ControlNet.
        prompt: Text prompt.
        negative_prompt: Negative text prompt.
        guidance_scale: Guidance scale used in both passes.
        controlnet_conditioning_scale: ControlNet strength used in both passes.
        control_guidance_start: Start of the ControlNet guidance range.
            Defaults to 0; the previous default of 1 was equal to
            ``control_guidance_end`` and left no range in which the ControlNet
            could apply.
        control_guidance_end: End of the ControlNet guidance range.
        upscaler_strength: ``strength`` of the img2img refinement pass.
        seed: RNG seed; ``-1`` picks a random one.
        sampler: Key into ``SAMPLER_MAP``.
        progress: Gradio progress tracker (unused directly; tracks tqdm).
        profile: Unused.

    Returns:
        The first image of the refinement pipeline's output.

    Raises:
        KeyError: If ``sampler`` is not a key of ``SAMPLER_MAP``.
    """
    control_image_small = center_crop_resize(control_image)
    control_image_large = center_crop_resize(control_image, (1024, 1024))

    # Apply the selected sampler to both passes. Each pipeline gets its own
    # scheduler instance so no step state is shared between them.
    build_scheduler = SAMPLER_MAP[sampler]
    scheduler_config = main_pipe.scheduler.config
    main_pipe.scheduler = build_scheduler(scheduler_config)
    image_pipe = _get_image_pipe()
    image_pipe.scheduler = build_scheduler(scheduler_config)

    my_seed = random.randint(0, 2**32 - 1) if seed == -1 else int(seed)
    generator = torch.Generator(device="cuda").manual_seed(my_seed)

    out = main_pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        image=control_image_small,
        guidance_scale=float(guidance_scale),
        controlnet_conditioning_scale=float(controlnet_conditioning_scale),
        generator=generator,
        control_guidance_start=float(control_guidance_start),
        control_guidance_end=float(control_guidance_end),
        num_inference_steps=15,
        output_type="latent"
    )

    upscaled_latents = upscale(out, "nearest-exact", 2)

    # The refinement pass takes img2img arguments (`control_image`, `strength`,
    # latents as `image`), so it must run on the img2img pipeline rather than
    # on the text-to-image `main_pipe`.
    out_image = image_pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        control_image=control_image_large,
        image=upscaled_latents,
        guidance_scale=float(guidance_scale),
        generator=generator,
        num_inference_steps=20,
        strength=upscaler_strength,
        control_guidance_start=float(control_guidance_start),
        control_guidance_end=float(control_guidance_end),
        controlnet_conditioning_scale=float(controlnet_conditioning_scale)
    )

    # Save image + metadata logic here

    # The UI binds this return value to its output image component.
    return out_image["images"][0]

# Gradio UI: a single column with the control image, the two prompts, a run
# button and the output image.
with gr.Blocks() as app:
    gr.Markdown('''
      <div style="text-align: center;">
      <h1>Illusion Diffusion HQ 🌀</h1>
      <p style="font-size:16px;">Generate stunning high quality illusion artwork with Stable Diffusion</p>
      </div>
      ''')
    
    with gr.Row():
        with gr.Column():
            control_image = gr.Image(label="Input Illusion", type="pil")
            prompt = gr.Textbox(label="Prompt", placeholder="Medieval village scene with busy streets and castle in the distance")
            negative_prompt = gr.Textbox(label="Negative Prompt", value="low quality")
            run_btn = gr.Button("Run")
            
            result_image = gr.Image(label="Illusion Diffusion Output", interactive=False)

            # Validate first; inference only runs when check_inputs succeeds.
            # Only the image and the two prompts are wired in, so inference
            # uses its defaults for every other parameter.
            run_btn.click(check_inputs, inputs=[prompt, control_image]).success(
                inference, inputs=[control_image, prompt, negative_prompt], outputs=[result_image]
            )

if __name__ == "__main__":
    app.launch()