# pi31415e2718 / app.py
# skyBluezz's picture
# Update app.py
# 003e1a1 verified
import spaces
import os
from typing import Tuple, Union, List
from PIL import Image
import numpy as np
import torch
from diffusers.pipelines.controlnet import StableDiffusionControlNetInpaintPipeline
from diffusers import ControlNetModel, UniPCMultistepScheduler, AutoPipelineForText2Image
from transformers import AutoImageProcessor, UperNetForSemanticSegmentation, \
AutoModelForDepthEstimation, DetrForObjectDetection, DetrImageProcessor
from colors import ade_palette
from utils import map_colors_rgb
from diffusers import StableDiffusionXLPipeline
import gradio as gr
import gc
# Target device and tensor dtype shared by the model-loading and inference
# code in this module.
device = "cuda"
dtype = torch.float16
# Stylesheet keyed on element ids; `img-display-input` and `img-display-output`
# match the `elem_id`s given to the Gradio image components in `create_demo`.
css = """
#img-display-container {
max-height: 50vh;
}
#img-display-input {
max-height: 40vh;
}
#img-display-output {
max-height: 40vh;
}
"""
def filter_items(
    colors_list: Union[List, np.ndarray],
    items_list: Union[List, np.ndarray],
    items_to_remove: Union[List, np.ndarray]
) -> Tuple[Union[List, np.ndarray], Union[List, np.ndarray]]:
    """
    Drop unwanted items, together with their paired colors.

    ``colors_list`` and ``items_list`` are walked in lockstep; a pair is kept
    only when its item does not appear in ``items_to_remove``. Relative order
    of the surviving pairs is preserved.

    Args:
        colors_list: Colors, one per entry of ``items_list``.
        items_list: Items aligned index-by-index with ``colors_list``.
        items_to_remove: Items whose pairs must be discarded.

    Returns:
        A ``(colors, items)`` tuple of two new lists holding the pairs that
        were kept.
    """
    surviving_pairs = [
        (color, item)
        for color, item in zip(colors_list, items_list)
        if item not in items_to_remove
    ]
    kept_colors = [color for color, _ in surviving_pairs]
    kept_items = [item for _, item in surviving_pairs]
    return kept_colors, kept_items
def get_segmentation_pipeline(
) -> Tuple[AutoImageProcessor, UperNetForSemanticSegmentation]:
    """Load the processor and model used for semantic segmentation.

    Both objects come from the same ``openmmlab/upernet-convnext-small``
    checkpoint.

    Returns:
        Tuple[AutoImageProcessor, UperNetForSemanticSegmentation]: the image
        processor followed by the segmentation model.
    """
    checkpoint = "openmmlab/upernet-convnext-small"
    processor = AutoImageProcessor.from_pretrained(checkpoint)
    segmentor = UperNetForSemanticSegmentation.from_pretrained(checkpoint)
    return processor, segmentor
@torch.inference_mode()
@spaces.GPU
def segment_image(
    image: Image,
    image_processor: AutoImageProcessor,
    image_segmentor: UperNetForSemanticSegmentation
) -> Image:
    """
    Run semantic segmentation and render the result as a color map.

    Args:
        image (Image): Picture to segment.
        image_processor (AutoImageProcessor): Prepares the model input and
            post-processes the model output.
        image_segmentor (UperNetForSemanticSegmentation): Model producing the
            segmentation outputs.

    Returns:
        Image: RGB image of the same size as ``image`` in which every pixel
        carries the ``ade_palette()`` color of its predicted class index.
        Pixels whose class index has no palette entry stay black.
    """
    model_inputs = image_processor(image, return_tensors="pt")
    with torch.no_grad():
        outputs = image_segmentor(model_inputs.pixel_values)

    # PIL reports (width, height); reverse it so the label map is requested
    # as (height, width).
    target_size = image.size[::-1]
    label_map = image_processor.post_process_semantic_segmentation(
        outputs, target_sizes=[target_size])[0]

    rows, cols = label_map.shape[0], label_map.shape[1]
    canvas = np.zeros((rows, cols, 3), dtype=np.uint8)
    # Paint each class with its palette color, one class index at a time.
    for class_id, class_color in enumerate(np.array(ade_palette())):
        canvas[label_map == class_id, :] = class_color

    return Image.fromarray(canvas.astype(np.uint8)).convert('RGB')
def get_depth_pipeline():
    """Load the depth-estimation image processor and model.

    Both are created from the ``LiheYoung/depth-anything-large-hf`` checkpoint
    with ``torch_dtype`` set to the module-level ``dtype``.

    Returns:
        A ``(feature_extractor, depth_estimator)`` tuple.
    """
    checkpoint = "LiheYoung/depth-anything-large-hf"
    processor = AutoImageProcessor.from_pretrained(checkpoint, torch_dtype=dtype)
    estimator = AutoModelForDepthEstimation.from_pretrained(checkpoint, torch_dtype=dtype)
    return processor, estimator
@torch.inference_mode()
@spaces.GPU
def get_depth_image(
    image: Image,
    feature_extractor: AutoImageProcessor,
    depth_estimator: AutoModelForDepthEstimation
) -> Image:
    """
    Estimate depth for ``image`` and return it as a grayscale-looking RGB image.

    The predicted depth is resized to the input resolution with bicubic
    interpolation, min-max normalised to ``[0, 1]`` and replicated over three
    channels.

    Args:
        image (Image): Input picture.
        feature_extractor (AutoImageProcessor): Builds the model inputs.
        depth_estimator (AutoModelForDepthEstimation): Model whose output
            exposes ``predicted_depth``.

    Returns:
        Image: 8-bit, three-channel depth image with the same width and height
        as ``image``.
    """
    model_inputs = feature_extractor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        depth_map = depth_estimator(**model_inputs).predicted_depth

    width, height = image.size
    # Add a channel axis and work in float32 for the interpolation.
    depth_map = torch.nn.functional.interpolate(
        depth_map.unsqueeze(1).float(),
        size=(height, width),
        mode="bicubic",
        align_corners=False,
    )

    depth_min = torch.amin(depth_map, dim=[1, 2, 3], keepdim=True)
    depth_max = torch.amax(depth_map, dim=[1, 2, 3], keepdim=True)
    # A perfectly flat depth map would give a zero range and NaNs after the
    # division; clamp the denominator so such a map normalises to all zeros.
    depth_range = torch.clamp(
        depth_max - depth_min, min=torch.finfo(depth_map.dtype).eps)
    depth_map = (depth_map - depth_min) / depth_range

    # Replicate the single channel three times and move channels last.
    depth_rgb = torch.cat([depth_map] * 3, dim=1)
    depth_rgb = depth_rgb.permute(0, 2, 3, 1).cpu().numpy()[0]
    return Image.fromarray((depth_rgb * 255.0).clip(0, 255).astype(np.uint8))
@torch.inference_mode()
@spaces.GPU
def run_detr(image:Image, detr_processor, detr_model, confidence_threshold: float = 0.8):
    """
    Detect objects in ``image`` with a DETR model.

    Args:
        image (Image): Picture to analyse.
        detr_processor: Processor that encodes the image and post-processes
            the detections.
        detr_model: Detection model; its ``config.id2label`` maps class ids
            to label names.
        confidence_threshold (float): Threshold forwarded to the
            post-processing step.

    Returns:
        A ``(boxes, labels)`` tuple: the ``boxes`` entry of the post-processed
        output and the list of label names for the detected classes.
    """
    # The processor is called with an empty annotation set for this image.
    empty_target = {'image_id': 0, 'annotations': []}
    encoding = detr_processor(images=image, annotations=empty_target, return_tensors="pt")
    raw_outputs = detr_model(
        pixel_values=encoding["pixel_values"].to(device), pixel_mask=None)

    img_w, img_h = image.size
    detections = detr_processor.post_process_object_detection(
        raw_outputs,
        target_sizes=[(img_h, img_w)],
        threshold=confidence_threshold)[0]

    # dict{scores, logits, labels, boxes}
    print(f"Output dict is {detections.keys()}: {detections}")
    label_ids = detections['labels']
    print(f"Labels are {label_ids}")

    id2label = detr_model.config.id2label
    labels = [id2label[label_id.item()] for label_id in label_ids]
    return detections['boxes'], labels
def resize_dimensions(dimensions, target_size):
    """
    Compute a size whose longer side equals ``target_size``, keeping the
    aspect ratio.

    Args:
        dimensions: ``(width, height)`` of the original image.
        target_size: Desired length of the longer side.

    Returns:
        ``dimensions`` unchanged when both sides are already smaller than
        ``target_size``; otherwise a ``(width, height)`` tuple scaled so the
        longer side is ``target_size``. The shorter side is truncated to an
        integer but never drops below 1, so the result is always a valid
        image size.
    """
    width, height = dimensions
    # Small images are left alone rather than upscaled.
    if width < target_size and height < target_size:
        return dimensions
    if width > height:
        # Landscape: width becomes the target, height follows the ratio.
        aspect_ratio = height / width
        return (target_size, max(1, int(target_size * aspect_ratio)))
    # Portrait or square: height becomes the target.
    aspect_ratio = width / height
    return (max(1, int(target_size * aspect_ratio)), target_size)
def flush():
    """Run the garbage collector, then empty PyTorch's CUDA cache."""
    # Collect first so that memory held only by dead Python objects can be
    # handed back by the cache release that follows.
    gc.collect()
    torch.cuda.empty_cache()
class ControlNetDepthDesignModelMulti:
    """Room-design generator driven by depth and segmentation ControlNets.

    The instance only stores generation settings; the pipelines and helper
    models it uses (``pipe``, ``guide_pipe``, ``seg_image_processor``,
    ``image_segmentor``, ``depth_feature_extractor``, ``depth_estimator``)
    are module-level globals.
    """

    def __init__(self):
        """Set the default seed, prompts and protected segment labels."""
        self.seed = 323*111
        self.neg_prompt = "window, door, low resolution, banner, logo, watermark, text, deformed, blurry, out of focus, surreal, ugly, beginner"
        # Segments carrying one of these labels are left out of the
        # inpainting mask.
        self.control_items = ["windowpane;window", "door;double;door"]
        self.additional_quality_suffix = "interior design, 4K, high resolution, photorealistic"

    @spaces.GPU
    def generate_design(self, empty_room_image: Image, prompt: str, guidance_scale: float = 10, num_steps: int = 50, strength: float = 0.9, img_size: int = 640) -> Image:
        """
        Given an image of an empty room and a prompt
        generate the designed room according to the prompt

        Inputs -
            empty_room_image - An RGB PIL Image of the empty room
            prompt - Text describing the target design elements of the room
            guidance_scale - Guidance scale forwarded to the inpainting pipeline
            num_steps - Number of inference steps for both pipelines
            strength - Inpainting strength forwarded to the pipeline
            img_size - Longer-side limit applied before processing
        Returns -
            design_image - PIL Image of the same size as the empty room image
            If the size is not the same the submission will fail.
        """
        print(prompt)
        flush()

        # UI sliders can hand these over as floats, while the seed, the step
        # count and the image size all have to be integers downstream.
        seed = int(self.seed)
        num_steps = int(num_steps)
        img_size = int(img_size)

        self.generator = torch.Generator(device=device).manual_seed(seed)
        pos_prompt = prompt + f', {self.additional_quality_suffix}'

        orig_w, orig_h = empty_room_image.size
        new_width, new_height = resize_dimensions(empty_room_image.size, img_size)
        input_image = empty_room_image.resize((new_width, new_height))

        # Segment the room and find which segment colors are present.
        real_seg = np.array(segment_image(input_image,
                                          seg_image_processor,
                                          image_segmentor))
        unique_colors = np.unique(real_seg.reshape(-1, real_seg.shape[2]), axis=0)
        unique_colors = [tuple(color) for color in unique_colors]
        segment_items = [map_colors_rgb(i) for i in unique_colors]
        chosen_colors, _ = filter_items(
            colors_list=unique_colors,
            items_list=segment_items,
            items_to_remove=self.control_items
        )

        # Mask = every pixel whose segment was not filtered out above.
        mask = np.zeros_like(real_seg)
        for color in chosen_colors:
            color_matches = (real_seg == color).all(axis=2)
            mask[color_matches] = 1

        image = input_image.convert("RGB")
        mask_image = Image.fromarray((mask * 255).astype(np.uint8)).convert("RGB")
        segmentation_cond_image = Image.fromarray(real_seg).convert("RGB")
        image_depth = get_depth_image(image, depth_feature_extractor, depth_estimator)

        # generate image that would be used as IP-adapter
        flush()
        # Round the guide image size down to a multiple of 8.
        new_width_ip = int(new_width / 8) * 8
        new_height_ip = int(new_height / 8) * 8
        ip_image = guide_pipe(pos_prompt,
                              num_inference_steps=num_steps,
                              negative_prompt=self.neg_prompt,
                              height=new_height_ip,
                              width=new_width_ip,
                              generator=[self.generator]).images[0]

        flush()
        generated_image = pipe(
            prompt=pos_prompt,
            negative_prompt=self.neg_prompt,
            num_inference_steps=num_steps,
            strength=strength,
            guidance_scale=guidance_scale,
            generator=[self.generator],
            image=image,
            mask_image=mask_image,
            ip_adapter_image=ip_image,
            control_image=[image_depth, segmentation_cond_image],
            controlnet_conditioning_scale=[0.5, 0.5]
        ).images[0]

        flush()
        # Return at the caller's original resolution.
        design_image = generated_image.resize(
            (orig_w, orig_h), Image.Resampling.LANCZOS
        )
        return design_image
def create_demo(model):
    """
    Build the demo widgets and wire the submit button to the model.

    Must be called while a Gradio layout context is open, since the
    components are created directly.

    Args:
        model: Object exposing ``seed``, ``neg_prompt``,
            ``additional_quality_suffix`` and ``generate_design(...)``.
    """
    gr.Markdown("### demo")
    with gr.Row():
        with gr.Column():
            input_image = gr.Image(label="Input Image", type='pil', elem_id='img-display-input')
            input_text = gr.Textbox(label='Prompt', placeholder='Please upload your image first', lines=2)
            with gr.Accordion('Advanced options', open=False):
                num_steps = gr.Slider(label='Steps',
                                      minimum=1,
                                      maximum=50,
                                      value=50,
                                      step=1)
                img_size = gr.Slider(label='Image size',
                                     minimum=256,
                                     maximum=768,
                                     value=768,
                                     step=64)
                guidance_scale = gr.Slider(label='Guidance Scale',
                                           minimum=0.1,
                                           maximum=30.0,
                                           value=10.0,
                                           step=0.1)
                seed = gr.Slider(label='Seed',
                                 minimum=-1,
                                 maximum=2147483647,
                                 value=323*111,
                                 step=1,
                                 randomize=True)
                strength = gr.Slider(label='Strength',
                                     minimum=0.1,
                                     maximum=1.0,
                                     value=0.9,
                                     step=0.1)
                detr_confidence_threshold = gr.Slider(label='Detr_confidence_threshold',
                                                      minimum=0.1,
                                                      maximum=1.0,
                                                      value=0.8,
                                                      step=0.1)
                a_prompt = gr.Textbox(
                    label='Added Prompt',
                    value="interior design, 4K, high resolution, photorealistic")
                n_prompt = gr.Textbox(
                    label='Negative Prompt',
                    value="window, door, low resolution, banner, logo, watermark, text, deformed, blurry, out of focus, surreal, ugly, beginner")
            submit = gr.Button("Submit")
        with gr.Column():
            # NOTE(review): this shows the generated design although the label
            # says "Output Mask" - confirm the intended wording.
            design_image = gr.Image(label="Output Mask", elem_id='img-display-output')
        # Each JSON panel gets its own id so the two DOM ids do not collide.
        with gr.Column():
            bboxes = gr.JSON(label="Detected Bboxes", elem_id='json-display-bboxes')
        with gr.Column():
            labels = gr.JSON(label="Detected Objects Labels", elem_id='json-display-labels')

    def on_submit(image, text, num_steps, guidance_scale, seed, strength, detr_confidence_threshold, a_prompt, n_prompt, img_size):
        """Generate a design, then run object detection on the result."""
        # Push the per-request settings onto the shared model object.
        model.seed = seed
        model.neg_prompt = n_prompt
        model.additional_quality_suffix = a_prompt
        with torch.no_grad():
            out_img = model.generate_design(image, text, guidance_scale=guidance_scale, num_steps=num_steps, strength=strength, img_size=img_size)
        # -----------------
        # -- run detr --
        # -----------------
        detected_boxes, detected_labels = run_detr(out_img, detr_processor, detr_model, detr_confidence_threshold)
        # Convert the boxes to plain lists for the JSON panel.
        return out_img, detected_boxes.tolist(), detected_labels

    submit.click(on_submit, inputs=[input_image, input_text, num_steps, guidance_scale, seed, strength, detr_confidence_threshold, a_prompt, n_prompt, img_size], outputs=[design_image, bboxes, labels])
# Two ControlNets, loaded in the module-level `dtype`.
# NOTE(review): "controlnet_depth" and "own_controlnet" look like local
# directories rather than hub ids - confirm they exist at runtime.
controlnet_depth= ControlNetModel.from_pretrained(
"controlnet_depth", torch_dtype=dtype, use_safetensors=True)
controlnet_seg = ControlNetModel.from_pretrained(
"own_controlnet", torch_dtype=dtype, use_safetensors=True)
# Inpainting pipeline conditioned on both ControlNets. The order here
# (depth, then segmentation) matches the order of `control_image` passed in
# `generate_design`.
pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(
"SG161222/Realistic_Vision_V5.1_noVAE",
#"models/runwayml--stable-diffusion-inpainting",
controlnet=[controlnet_depth, controlnet_seg],
safety_checker=None,
torch_dtype=dtype
)
# Attach the IP-Adapter weights and set their scale to 0.4.
pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models",
weight_name="ip-adapter_sd15.bin")
pipe.set_ip_adapter_scale(0.4)
# Swap in a UniPC multistep scheduler built from the existing scheduler config.
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
pipe = pipe.to(device)
# Text-to-image pipeline; `generate_design` uses it to create the image that
# is fed to `pipe` as `ip_adapter_image`.
guide_pipe = StableDiffusionXLPipeline.from_pretrained("segmind/SSD-1B",
torch_dtype=dtype, use_safetensors=True, variant="fp16")
guide_pipe = guide_pipe.to(device)
# Segmentation and depth models. Only the depth estimator is moved to
# `device` here; the segmentation model is left where it was loaded.
seg_image_processor, image_segmentor = get_segmentation_pipeline()
depth_feature_extractor, depth_estimator = get_depth_pipeline()
depth_estimator = depth_estimator.to(device)
# ---------------------------------------
# Load Detr Model
# ---------------------------------------
# NOTE(review): "facebook-detr-50" is presumably a local directory - confirm.
detr_model = DetrForObjectDetection.from_pretrained("facebook-detr-50",
# id2label={v:k for k,v in CLASS2ID.items()},
use_safetensors=True).to(device)
detr_processor = DetrImageProcessor.from_pretrained("facebook-detr-50",
use_safetensors=True)
def main():
    """Create the design model, build the Gradio UI around it and launch it."""
    model = ControlNetDepthDesignModelMulti()
    print('Models uploaded successfully')
    title = "# pi31415e2718"
    description = """ Broken Code """
    # Hand the module-level stylesheet to Blocks; without it the
    # #img-display-* rules are never applied to the image components.
    with gr.Blocks(css=css) as demo:
        gr.Markdown(title)
        gr.Markdown(description)
        create_demo(model)
        gr.HTML(''' ''')
    demo.queue().launch(share=False)
# Start the demo only when this file is executed as a script.
if __name__ == '__main__':
    main()