import gradio as gr
import numpy as np
import torch
from diffusers import (
    AutoencoderTiny,
    ControlNetModel,
    DPMSolverSDEScheduler,
    StableDiffusionXLControlNetInpaintPipeline,
    StableDiffusionXLControlNetPipeline,
    StableDiffusionXLImg2ImgPipeline,
)
from diffusers.image_processor import IPAdapterMaskProcessor
from diffusers.utils import load_image

MAX_SEED = np.iinfo(np.int32).max
MAX_IMAGE_SIZE = 1024

device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

processor_mask = IPAdapterMaskProcessor()

# Depth and canny ControlNets for SDXL.
controlnets = [
    ControlNetModel.from_pretrained(
        "diffusers/controlnet-depth-sdxl-1.0", variant="fp16", use_safetensors=True, torch_dtype=torch.float16
    ),
    ControlNetModel.from_pretrained(
        "diffusers/controlnet-canny-sdxl-1.0", variant="fp16", use_safetensors=True, torch_dtype=torch.float16
    ),
]

# Base text-to-image pipeline. The depth ControlNet is used twice: once on the full
# depth map and once on the masked depth conditioning built by make_inpaint_condition().
pipe_CN = StableDiffusionXLControlNetPipeline.from_pretrained(
    "SG161222/RealVisXL_V5.0",
    torch_dtype=torch.float16,
    controlnet=[controlnets[0], controlnets[0]],
    use_safetensors=True,
    variant="fp16",
)
# pipe_CN.vae = AutoencoderTiny.from_pretrained("madebyollin/taesdxl", torch_dtype=torch.float16)
pipe_CN.scheduler = DPMSolverSDEScheduler.from_pretrained(
    "SG161222/RealVisXL_V5.0", subfolder="scheduler", use_karras_sigmas=True
)
pipe_CN.to("cuda")

# Load the DreamBooth LoRA into the UNet and both SDXL text encoders.
state_dict, network_alphas = StableDiffusionXLControlNetPipeline.lora_state_dict(
    "CreativesCombined/hb8_cases_dreambooth_lora_test_1_14", weight_name="pytorch_lora_weights.safetensors"
)
pipe_CN.load_lora_into_unet(state_dict, network_alphas, pipe_CN.unet, adapter_name="unet_cases")
pipe_CN.load_lora_into_text_encoder(state_dict, network_alphas, pipe_CN.text_encoder, adapter_name="text_cases")
# The second text encoder's LoRA keys use the "text_encoder_2" prefix.
pipe_CN.load_lora_into_text_encoder(
    state_dict, network_alphas, pipe_CN.text_encoder_2, prefix="text_encoder_2", adapter_name="text_2_cases"
)
pipe_CN.set_adapters(["unet_cases", "text_cases", "text_2_cases"], adapter_weights=[1.0, 0.5, 0.5])

# Refiner shares the VAE and second text encoder with the base pipeline.
refiner = StableDiffusionXLImg2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-refiner-1.0",
    text_encoder_2=pipe_CN.text_encoder_2,
    vae=pipe_CN.vae,
    torch_dtype=torch.float16,
    use_safetensors=True,
    variant="fp16",
)
refiner.to("cuda")

# Inpainting pipeline (depth + canny ControlNets) with the OurHood LoRA.
pipe_IN = StableDiffusionXLControlNetInpaintPipeline.from_pretrained(
    "diffusers/stable-diffusion-xl-1.0-inpainting-0.1",
    controlnet=controlnets,
    torch_dtype=torch.float16,
    variant="fp16",
)
pipe_IN.load_lora_weights(
    "Tonioesparza/ourhood_training_dreambooth_lora_2_0",
    weight_name="pytorch_lora_weights.safetensors",
    adapter_name="ourhood",
)
pipe_IN.to("cuda")


def make_inpaint_condition(image, image_mask):
    """Turn a PIL image + mask into an inpaint conditioning tensor (masked pixels set to -1)."""
    image = np.array(image.convert("RGB")).astype(np.float32) / 255.0
    image_mask = np.array(image_mask.convert("L")).astype(np.float32) / 255.0
    assert image.shape[0:2] == image_mask.shape[0:2], "image and image_mask must have the same size"
    image[image_mask > 0.5] = -1.0  # set as masked pixel
    image = np.expand_dims(image, 0).transpose(0, 3, 1, 2)
    return torch.from_numpy(image)


def ourhood_inference(prompt: str, num_inference_steps: int, scaffold: int, seed: int):
    # Precomputed mask / depth / canny conditioning images for the three perspectives.
    scaff_dic = {
        1: {
            "mask1": "https://huggingface.co/Tonioesparza/ourhood_training_dreambooth_lora_2_0/resolve/main/mask_in_square_2.png",
            "depth_image": "https://huggingface.co/Tonioesparza/ourhood_training_dreambooth_lora_2_0/resolve/main/mask_depth_noroof_square.png",
            "canny_image": "https://huggingface.co/Tonioesparza/ourhood_training_dreambooth_lora_2_0/resolve/main/mask_depth_solo_square.png",
        },
        2: {
            "mask1": "https://huggingface.co/Tonioesparza/ourhood_training_dreambooth_lora_2_0/resolve/main/mask_in_C.png",
            "depth_image": "https://huggingface.co/Tonioesparza/ourhood_training_dreambooth_lora_2_0/resolve/main/depth_C.png",
            "canny_image": "https://huggingface.co/Tonioesparza/ourhood_training_dreambooth_lora_2_0/resolve/main/canny_C_solo.png",
        },
        3: {
            "mask1": "https://huggingface.co/Tonioesparza/ourhood_training_dreambooth_lora_2_0/resolve/main/mask_in_B.png",
            "depth_image": "https://huggingface.co/Tonioesparza/ourhood_training_dreambooth_lora_2_0/resolve/main/depth_B.png",
            "canny_image": "https://huggingface.co/Tonioesparza/ourhood_training_dreambooth_lora_2_0/resolve/main/canny_B_solo.png",
        },
    }
    # pipe_CN.fuse_lora()

    output_height = 1024
    output_width = 1024

    mask1 = load_image(scaff_dic[scaffold]["mask1"])
    # IP-Adapter attention masks (currently unused while the IP-Adapter inputs below stay disabled).
    masks = processor_mask.preprocess([mask1], height=output_height, width=output_width)
    masks = [masks.reshape(1, masks.shape[0], masks.shape[2], masks.shape[3])]

    # Optional IP-Adapter reference images (disabled):
    # ip_img_1 = load_image(r"C:\Users\AntonioEsparzaGlisma\PycharmProjects\hB8\Cases\a-place-to_210930_HAY_A-PLACE-TO_091-768x1024.png")
    # ip_images = [[ip_img_1]]
    # pipe_CN.set_ip_adapter_scale([[0.7]])

    n_steps = num_inference_steps

    # Precomputed depth and canny conditioning images.
    depth_image = load_image(scaff_dic[scaffold]["depth_image"])
    canny_image = load_image(scaff_dic[scaffold]["canny_image"])
    masked_depth = make_inpaint_condition(depth_image, mask1)
    images_CN = [depth_image, canny_image]

    prompt1 = "A frontpage still-life photograph, an 8-foot wooden crate, " + prompt + " in the style of hb8 interior architecture"
    neg1 = "text, watermark"
    prompt2 = "Photorealistic rendering, of an OurHood privacy booth, with a silken oak frame, hickory stained melange polyester fabric, windows"
    neg2 = "curtains, pillows"

    generator = torch.manual_seed(seed)

    # Stage 1: base ControlNet pass, stopped at 90% of the schedule and returned as latents.
    results = pipe_CN(
        prompt=prompt1,
        # ip_adapter_image=ip_images,
        negative_prompt=neg1,
        num_inference_steps=n_steps,
        num_images_per_prompt=1,
        generator=generator,
        denoising_end=0.9,
        image=[depth_image, masked_depth],
        output_type="latent",
        control_guidance_start=[0.0, 0.5],
        control_guidance_end=[0.5, 1.0],
        controlnet_conditioning_scale=[0.5, 1.0],
    ).images[0]

    # Stage 2: refiner finishes the last 10% of the denoising schedule.
    image = refiner(
        prompt=prompt1,
        num_inference_steps=n_steps,
        denoising_start=0.9,
        image=results,
    ).images[0]

    # Stage 3: inpaint the booth into the masked region, guided by depth + canny.
    image = pipe_IN(
        prompt=prompt2,
        negative_prompt=neg2,
        image=image,
        mask_image=mask1,
        num_inference_steps=65,
        strength=1.0,
        control_guidance_end=[0.9, 0.9],
        controlnet_conditioning_scale=[0.35, 0.65],
        control_image=images_CN,
        generator=generator,
    ).images[0]

    return image


"""
image = refiner(
    prompt=prompt,
    num_inference_steps=40,
    denoising_start=0.8,
    image=image,
).images[0]
"""

# @spaces.GPU  # [uncomment to use ZeroGPU]

examples = [
    "in a British museum, pavilion, masonry, high-tables and chairs",
    "in a high-ceilinged atrium, glass front, plant walls, concrete floor, furniture, golden hour",
    "in a colorful open office environment",
    "in a Nordic atrium environment",
]

css = """
#col-container {
    margin: 0 auto;
    max-width: 640px;
}
"""

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown("# HB8-Ourhood inference test")
        with gr.Row():
            prompt = gr.Text(
                label="Setting prompt",
                show_label=False,
                max_lines=1,
                placeholder="Where do you want to show the Ourhood pod?",
                container=False,
            )
            run_button = gr.Button("Run", scale=0)

        result = gr.Image(label="Result", show_label=False)

        with gr.Accordion("Advanced Settings", open=False):
            perspective = gr.Slider(
                label="perspective",
                minimum=1,
                maximum=3,
                step=1,
                value=1,
            )
            seed = gr.Slider(
                label="tracking number (seed)",
                minimum=0,
                maximum=MAX_SEED,
                step=1,
                value=0,
            )
            with gr.Row():
                num_inference_steps = gr.Slider(
                    label="Number of inference steps",
                    minimum=35,
                    maximum=50,
                    step=1,
                    value=35,  # replace with defaults that work for your model
                )

        gr.Examples(
            examples=examples,
            inputs=[prompt],
        )

    gr.on(
        triggers=[run_button.click, prompt.submit],
        fn=ourhood_inference,
        inputs=[prompt, num_inference_steps, perspective, seed],
        outputs=[result],
    )

demo.queue().launch()
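
# A minimal sketch (not part of the original Space) of calling ourhood_inference directly,
# e.g. from a REPL for a quick smoke test with the launch() call above commented out. The
# prompt, step count, perspective, and seed are illustrative values taken from the UI defaults.
#
# test_image = ourhood_inference(
#     prompt="in a Nordic atrium environment",
#     num_inference_steps=35,
#     scaffold=1,  # perspective 1-3
#     seed=0,
# )
# test_image.save("ourhood_smoke_test.png")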