# ProductPlacement / gradio_demo.py
# Ashoka74's picture
# Update gradio_demo.py
# eef85c0 verified
import spaces
import os
import math
import gradio as gr
import numpy as np
import torch
import safetensors.torch as sf
import db_examples
import datetime
from pathlib import Path
from io import BytesIO
import hydra
from hydra import initialize, compose
from omegaconf import DictConfig
from PIL import Image
from diffusers import StableDiffusionPipeline, StableDiffusionImg2ImgPipeline
from diffusers import AutoencoderKL, UNet2DConditionModel, DDIMScheduler, EulerAncestralDiscreteScheduler, DPMSolverMultistepScheduler
from diffusers.models.attention_processor import AttnProcessor2_0
from transformers import CLIPTextModel, CLIPTokenizer
from briarmbg import BriaRMBG
import dds_cloudapi_sdk
from dds_cloudapi_sdk import Config, Client, TextPrompt
from dds_cloudapi_sdk.tasks.detection import DetectionTask
from dds_cloudapi_sdk.tasks import DetectionTarget
from dds_cloudapi_sdk.tasks.dinox import DinoxTask
from enum import Enum
from torch.hub import download_url_to_file
import tempfile
from sam2.build_sam import build_sam2
from sam2.sam2_image_predictor import SAM2ImagePredictor
import cv2
from typing import Optional
from Depth.depth_anything_v2.dpt import DepthAnythingV2
import httpx
client = httpx.Client(timeout=httpx.Timeout(10.0)) # Set timeout to 10 seconds
# from FLORENCE
import spaces
import supervision as sv
import torch
from PIL import Image
from utils.sam import load_sam_image_model, run_sam_inference
# Probe for the optional xformers package. XFORMERS_AVAILABLE records whether
# the import succeeded so that later code can choose an attention backend.
try:
    import xformers
    import xformers.ops
except ImportError:
    XFORMERS_AVAILABLE = False
    print("xformers not available - Using default attention")
else:
    XFORMERS_AVAILABLE = True
    print("xformers is available - Using memory efficient attention")
# Memory optimizations for RTX 2070
# Turn on cuDNN autotuning, then pick the compute device: CUDA when it is
# available (with TF32 enabled), otherwise the CPU.
torch.backends.cudnn.benchmark = True
if not torch.cuda.is_available():
    device = torch.device('cpu')
else:
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    # NOTE(review): the original comment described this as an attention slice
    # size; it only assigns an attribute on torch.backends.cuda - confirm it
    # has any effect on the allocator.
    torch.backends.cuda.max_split_size_mb = 512
    device = torch.device('cuda')
# Base Stable Diffusion 1.5 checkpoint to load. Candidates that have been used:
# 'stablediffusionapi/realistic-vision-v51'
# 'runwayml/stable-diffusion-v1-5'
sd15_name = 'stablediffusionapi/realistic-vision-v51'
# Each component is loaded from its own subfolder of the same repository.
tokenizer = CLIPTokenizer.from_pretrained(sd15_name, subfolder="tokenizer")
text_encoder = CLIPTextModel.from_pretrained(sd15_name, subfolder="text_encoder")
vae = AutoencoderKL.from_pretrained(sd15_name, subfolder="vae")
unet = UNet2DConditionModel.from_pretrained(sd15_name, subfolder="unet")
# Background-removal model used by run_rmbg().
rmbg = BriaRMBG.from_pretrained("briaai/RMBG-1.4")
# Depth estimation model ('vits' encoder) used by get_depth().
model = DepthAnythingV2(encoder='vits', features=64, out_channels=[48, 96, 192, 384])
# NOTE(review): this checkpoint is read before download_models() is called
# further down the file, so presumably it must already exist on disk - confirm.
model.load_state_dict(torch.load('checkpoints/depth_anything_v2_vits.pth', map_location=device))
model = model.to(device)
model.eval()
# SAM image model, loaded onto the same device.
SAM_IMAGE_MODEL = load_sam_image_model(device=device)
# Change UNet
# Replace the UNet input convolution with a 12-channel one. All weights start
# at zero, then the first 4 input channels are copied from the pretrained
# layer; the bias is reused as-is.
with torch.no_grad():
    new_conv_in = torch.nn.Conv2d(
        in_channels=12,
        out_channels=unet.conv_in.out_channels,
        kernel_size=unet.conv_in.kernel_size,
        stride=unet.conv_in.stride,
        padding=unet.conv_in.padding,
    )
    new_conv_in.weight.zero_()
    new_conv_in.weight[:, :4, :, :].copy_(unet.conv_in.weight)
    new_conv_in.bias = unet.conv_in.bias
    unet.conv_in = new_conv_in

# Keep a handle on the unhooked forward so the hook can delegate to it.
unet_original_forward = unet.forward
def enable_efficient_attention():
    """Configure the attention backend of the global ``unet`` and ``vae``.

    Tries xformers memory-efficient attention when XFORMERS_AVAILABLE is set.
    If xformers is missing, or enabling it raises, both models are switched to
    ``AttnProcessor2_0`` instead. The log messages still say "sliced
    attention" although no slicing is configured here.
    """
    if XFORMERS_AVAILABLE:
        try:
            # RTX 2070 specific settings
            unet.set_use_memory_efficient_attention_xformers(True)
            vae.set_use_memory_efficient_attention_xformers(True)
            print("Enabled xformers memory efficient attention")
            return
        except Exception as e:
            print(f"Xformers error: {e}")
            print("Falling back to sliced attention")
    else:
        # Fallback for when xformers is not available
        print("Using sliced attention")

    # Shared fallback path (attention slicing is intentionally left disabled).
    # unet.set_attention_slice_size(4)
    # vae.set_attention_slice_size(4)
    unet.set_attn_processor(AttnProcessor2_0())
    vae.set_attn_processor(AttnProcessor2_0())
# Add memory clearing function
def clear_memory():
    """Empty the CUDA cache and synchronize; does nothing without CUDA."""
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
# Enable efficient attention
# Runs once at import time, configuring the attention backend of the global
# unet/vae before the UNet forward is hooked and the pipelines are built.
enable_efficient_attention()
def hooked_unet_forward(sample, timestep, encoder_hidden_states, **kwargs):
    """Forward wrapper that appends conditioning latents to the UNet input.

    Reads ``kwargs['cross_attention_kwargs']['concat_conds']``, tiles it along
    the batch axis to match ``sample``, concatenates it on the channel axis and
    calls the original forward with an emptied ``cross_attention_kwargs``.

    Raises:
        KeyError: if ``cross_attention_kwargs`` or ``concat_conds`` is missing.
    """
    cond = kwargs['cross_attention_kwargs']['concat_conds'].to(sample)
    # The sample batch may be a multiple of the conditioning batch.
    copies = sample.shape[0] // cond.shape[0]
    cond = torch.cat([cond] * copies, dim=0)
    widened = torch.cat([sample, cond], dim=1)
    # The conditioning has been consumed; do not pass it on to the real forward.
    kwargs['cross_attention_kwargs'] = {}
    return unet_original_forward(widened, timestep, encoder_hidden_states, **kwargs)


# Install the hook on the global UNet.
unet.forward = hooked_unet_forward
# Load
# Model paths
# Local destinations of the files fetched by download_models().
model_path = './models/iclight_sd15_fbc.safetensors'
model_path2 = './checkpoints/depth_anything_v2_vits.pth'
model_path3 = './checkpoints/sam2_hiera_large.pt'
model_path4 = './checkpoints/config.json'
model_path5 = './checkpoints/preprocessor_config.json'
model_path6 = './configs/sam2_hiera_l.yaml'
model_path7 = './mvadapter_i2mv_sdxl.safetensors'
# Base URL for the repository
BASE_URL = 'https://huggingface.co/Ashoka74/Placement/resolve/main/'
# Model URLs
# Maps each local path to the file name appended to BASE_URL when downloading.
model_urls = {
model_path: 'iclight_sd15_fbc.safetensors',
model_path2: 'depth_anything_v2_vits.pth',
model_path3: 'sam2_hiera_large.pt',
model_path4: 'config.json',
model_path5: 'preprocessor_config.json',
model_path6: 'sam2_hiera_l.yaml',
model_path7: 'mvadapter_i2mv_sdxl.safetensors'
}
# Ensure directories exist
def ensure_directories():
    """Create the parent directory of every local path listed in ``model_urls``."""
    for local_path in model_urls:
        parent_dir = os.path.dirname(local_path)
        os.makedirs(parent_dir, exist_ok=True)
# Download models
def download_models():
    """Fetch every file in ``model_urls`` that is not already on disk.

    Failures are reported with ``print`` and do not stop the remaining
    downloads.
    """
    for local_path, filename in model_urls.items():
        if os.path.exists(local_path):
            continue  # already present, nothing to fetch
        try:
            url = f"{BASE_URL}{filename}"
            print(f"Downloading {filename}")
            download_url_to_file(url, local_path)
            print(f"Successfully downloaded {filename}")
        except Exception as e:
            print(f"Error downloading {filename}: {e}")
# Make sure the target folders exist, then download any missing model files.
ensure_directories()
download_models()
# if not os.path.exists(model_path):
# download_url_to_file(url='https://huggingface.co/lllyasviel/ic-light/resolve/main/iclight_sd15_fc.safetensors', dst=model_path)
# Merge the weights stored in the safetensors file into the UNet: every UNet
# parameter becomes "original + offset". The lookup raises KeyError if the
# file lacks any key present in the UNet state dict.
sd_offset = sf.load_file(model_path)
sd_origin = unet.state_dict()
keys = sd_origin.keys()
sd_merged = {k: sd_origin[k] + sd_offset[k] for k in sd_origin.keys()}
unet.load_state_dict(sd_merged, strict=True)
# Drop the temporary state dicts so they can be garbage-collected.
del sd_offset, sd_origin, sd_merged, keys
# Device
# device = torch.device('cuda')
# text_encoder = text_encoder.to(device=device, dtype=torch.float16)
# vae = vae.to(device=device, dtype=torch.bfloat16)
# unet = unet.to(device=device, dtype=torch.float16)
# rmbg = rmbg.to(device=device, dtype=torch.float32)
# Device and dtype setup
# NOTE(review): this rebinds `device` to CUDA unconditionally, replacing the
# CUDA/CPU choice made near the top of the file - confirm that no CPU
# fallback is expected from this point on.
device = torch.device('cuda')
dtype = torch.float16 # RTX 2070 works well with float16
# Memory optimizations for RTX 2070
torch.backends.cudnn.benchmark = True
if torch.cuda.is_available():
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    # Set a very small attention slice size for RTX 2070 to avoid OOM
    # NOTE(review): this only assigns an attribute on torch.backends.cuda;
    # confirm it actually influences attention slicing or the allocator.
    torch.backends.cuda.max_split_size_mb = 128
# Move models to device with consistent dtype
text_encoder = text_encoder.to(device=device, dtype=dtype)
vae = vae.to(device=device, dtype=dtype) # Changed from bfloat16 to float16
unet = unet.to(device=device, dtype=dtype)
rmbg = rmbg.to(device=device, dtype=torch.float32) # Keep this as float32
# Three noise schedulers sharing the same timestep count and beta range.
# Only dpmpp_2m_sde_karras_scheduler is passed to the pipelines built below.
ddim_scheduler = DDIMScheduler(
num_train_timesteps=1000,
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
clip_sample=False,
set_alpha_to_one=False,
steps_offset=1,
)
euler_a_scheduler = EulerAncestralDiscreteScheduler(
num_train_timesteps=1000,
beta_start=0.00085,
beta_end=0.012,
steps_offset=1
)
# DPM-Solver++ in SDE mode with Karras sigmas enabled.
dpmpp_2m_sde_karras_scheduler = DPMSolverMultistepScheduler(
num_train_timesteps=1000,
beta_start=0.00085,
beta_end=0.012,
algorithm_type="sde-dpmsolver++",
use_karras_sigmas=True,
steps_offset=1
)
# Pipelines
# Both pipelines share the same vae / text_encoder / tokenizer / unet objects
# and the same scheduler instance; the safety checker is disabled.
# Text-to-image pipeline.
t2i_pipe = StableDiffusionPipeline(
vae=vae,
text_encoder=text_encoder,
tokenizer=tokenizer,
unet=unet,
scheduler=dpmpp_2m_sde_karras_scheduler,
safety_checker=None,
requires_safety_checker=False,
feature_extractor=None,
image_encoder=None
)
# Image-to-image pipeline, used for the refinement passes.
i2i_pipe = StableDiffusionImg2ImgPipeline(
vae=vae,
text_encoder=text_encoder,
tokenizer=tokenizer,
unet=unet,
scheduler=dpmpp_2m_sde_karras_scheduler,
safety_checker=None,
requires_safety_checker=False,
feature_extractor=None,
image_encoder=None
)
@spaces.GPU(duration=60)
@torch.inference_mode()
def encode_prompt_inner(txt: str):
    """Encode a prompt of arbitrary length with the global CLIP text encoder.

    The prompt is tokenized without truncation and split into chunks of
    ``model_max_length - 2`` tokens. Each chunk is wrapped in BOS/EOS and
    padded with EOS to ``model_max_length``, then all chunks are encoded as
    one batch.

    Args:
        txt: Prompt text; may be empty.

    Returns:
        The encoder's ``last_hidden_state`` for the batch of chunks. There is
        always at least one chunk, including for an empty prompt.
    """
    max_length = tokenizer.model_max_length
    chunk_length = tokenizer.model_max_length - 2
    id_start = tokenizer.bos_token_id
    id_end = tokenizer.eos_token_id
    id_pad = id_end

    def pad(x, p, i):
        # Truncate to i items, or right-pad with p up to i items.
        return x[:i] if len(x) >= i else x + [p] * (i - len(x))

    tokens = tokenizer(txt, truncation=False, add_special_tokens=False)["input_ids"]
    # An empty prompt has no tokens. Emit one BOS/EOS-only chunk anyway so the
    # encoder gets a non-empty batch and callers never divide by a zero count.
    chunk_starts = range(0, max(len(tokens), 1), chunk_length)
    chunks = [[id_start] + tokens[i: i + chunk_length] + [id_end] for i in chunk_starts]
    chunks = [pad(ck, id_pad, max_length) for ck in chunks]
    token_ids = torch.tensor(chunks).to(device=device, dtype=torch.int64)
    conds = text_encoder(token_ids).last_hidden_state
    return conds
@spaces.GPU(duration=60)
@torch.inference_mode()
def encode_prompt_pair(positive_prompt, negative_prompt):
    """Encode a positive/negative prompt pair into equally long embeddings.

    Both prompts are encoded chunk-wise. The shorter one is repeated until it
    has as many chunks as the longer one, and each result is flattened into a
    single sequence with a leading batch axis of 1.

    Returns:
        Tuple ``(positive_embeds, negative_embeds)``.
    """
    pos_chunks = encode_prompt_inner(positive_prompt)
    neg_chunks = encode_prompt_inner(negative_prompt)
    target = max(len(pos_chunks), len(neg_chunks))

    def _fit(chunks):
        # Repeat the chunks often enough to reach `target`, then trim.
        repeats = int(math.ceil(float(target) / float(len(chunks))))
        tiled = torch.cat([chunks] * repeats, dim=0)[:target]
        # Join the chunks along the sequence axis under a batch axis of 1.
        return torch.cat([row[None, ...] for row in tiled], dim=1)

    return _fit(pos_chunks), _fit(neg_chunks)
@spaces.GPU(duration=60)
@torch.inference_mode()
def pytorch2numpy(imgs, quant=True):
    """Convert an iterable of channel-first tensors to channel-last arrays.

    Args:
        imgs: Iterable of tensors with the channel axis first; values are
            mapped from [-1, 1] to the output range.
        quant: When true, return uint8 arrays in [0, 255]; otherwise float32
            arrays in [0, 1].

    Returns:
        List of numpy arrays, one per input tensor.
    """
    def _convert(tensor):
        channels_last = tensor.movedim(0, -1)
        if quant:
            scaled = channels_last * 127.5 + 127.5
            return scaled.detach().float().cpu().numpy().clip(0, 255).astype(np.uint8)
        scaled = channels_last * 0.5 + 0.5
        return scaled.detach().float().cpu().numpy().clip(0, 1).astype(np.float32)

    return [_convert(x) for x in imgs]
@spaces.GPU(duration=60)
@torch.inference_mode()
def numpy2pytorch(imgs):
    """Stack channel-last images into one channel-first float tensor.

    Pixel values are divided by 127 and shifted by -1, so that 127 maps
    exactly to 0.0.
    """
    batch = np.stack(imgs, axis=0)
    tensor = torch.from_numpy(batch).float() / 127.0 - 1.0  # so that 127 must be strictly 0.0
    return tensor.movedim(-1, 1)
def resize_and_center_crop(image, target_width, target_height):
    """Scale an image to cover the target size, then crop around its centre.

    Args:
        image: Image as a numpy array accepted by ``Image.fromarray``.
        target_width: Output width in pixels.
        target_height: Output height in pixels.

    Returns:
        The cropped image as a numpy array.
    """
    source = Image.fromarray(image)
    src_w, src_h = source.size
    # Use the larger ratio so the resized image covers the whole target box.
    scale = max(target_width / src_w, target_height / src_h)
    new_w = int(round(src_w * scale))
    new_h = int(round(src_h * scale))
    scaled = source.resize((new_w, new_h), Image.LANCZOS)
    crop_box = (
        (new_w - target_width) / 2,
        (new_h - target_height) / 2,
        (new_w + target_width) / 2,
        (new_h + target_height) / 2,
    )
    return np.array(scaled.crop(crop_box))
def resize_without_crop(image, target_width, target_height):
    """Resize an image to exactly the target size, ignoring aspect ratio."""
    target_size = (target_width, target_height)
    return np.array(Image.fromarray(image).resize(target_size, Image.LANCZOS))
@spaces.GPU(duration=60)
@torch.inference_mode()
def run_rmbg(img, sigma=0.0):
    """Estimate a foreground matte with the global RMBG model.

    Args:
        img: HxWx3 or HxWx4 image array. A 4-channel input is first
            composited over a white background.
        sigma: Offset added to the pixel values before they are blended
            toward grey (127) by the matte.

    Returns:
        Tuple ``(blended, rgba)``: the uint8 image faded to grey where the
        matte is low, and a uint8 RGBA image whose alpha is the matte.
    """
    # Convert RGBA to RGB if needed
    if img.shape[-1] == 4:
        # Use white background for alpha composition
        opacity = img[..., 3:] / 255.0
        colour = img[..., :3]
        white = np.ones_like(colour) * 255
        img = (colour * opacity + white * (1 - opacity)).astype(np.uint8)

    H, W, C = img.shape
    assert C == 3

    # Feed the model an image whose sides are multiples of 64 and whose area
    # is roughly 1024 x 1024.
    k = (256.0 / float(H * W)) ** 0.5
    feed_width = int(64 * round(W * k))
    feed_height = int(64 * round(H * k))
    feed = resize_without_crop(img, feed_width, feed_height)
    feed = numpy2pytorch([feed]).to(device=device, dtype=torch.float32)

    matte = rmbg(feed)[0][0]
    matte = torch.nn.functional.interpolate(matte, size=(H, W), mode="bilinear")
    matte = matte.movedim(1, -1)[0]
    matte = matte.detach().float().cpu().numpy().clip(0, 1)

    # Create RGBA image
    rgba = np.dstack((img, matte * 255)).astype(np.uint8)
    blended = 127 + (img.astype(np.float32) - 127 + sigma) * matte
    return blended.clip(0, 255).astype(np.uint8), rgba
@spaces.GPU(duration=60)
@torch.inference_mode()
def process(input_fg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, lowres_denoise, bg_source):
    """Relight a foreground image in two diffusion passes (low-res, then high-res).

    The working resolution is taken from ``input_fg``; ``image_width`` and
    ``image_height`` are accepted for interface compatibility but not used.

    Args:
        input_fg: Foreground image array (H x W x C).
        prompt: Main prompt; ``a_prompt`` is appended to it.
        image_width, image_height: Unused, see above.
        num_samples: Number of images to generate.
        seed: Seed for the torch generator.
        steps: Base number of inference steps.
        a_prompt: Text appended to ``prompt``.
        n_prompt: Negative prompt.
        cfg: Guidance scale.
        highres_scale: Upscale factor applied before the second pass.
        highres_denoise: Denoising strength of the second pass.
        lowres_denoise: Denoising strength of the first pass when an initial
            background latent is used.
        bg_source: ``BGSource`` member or its string value. The light-direction
            and ambient options synthesize an initial background. The upload
            options have no background image in this function and fall back to
            plain text-to-image initialization.

    Returns:
        Array stacked from the generated uint8 images, each resized back to
        the input size.

    Raises:
        ValueError: if ``bg_source`` is not a valid ``BGSource``.
    """
    clear_memory()

    # Get input dimensions
    input_height, input_width = input_fg.shape[:2]

    # This function takes no background image. Bind the name up front so the
    # "is None" check below is always valid.
    input_bg = None
    bg_source = BGSource(bg_source)

    if bg_source in (BGSource.UPLOAD, BGSource.UPLOAD_FLIP):
        # No uploaded background is available here: start from pure noise.
        pass
    elif bg_source == BGSource.GREY:
        input_bg = np.zeros(shape=(input_height, input_width, 3), dtype=np.uint8) + 64
    elif bg_source == BGSource.LEFT:
        gradient = np.linspace(255, 0, input_width)
        image = np.tile(gradient, (input_height, 1))
        input_bg = np.stack((image,) * 3, axis=-1).astype(np.uint8)
    elif bg_source == BGSource.RIGHT:
        gradient = np.linspace(0, 255, input_width)
        image = np.tile(gradient, (input_height, 1))
        input_bg = np.stack((image,) * 3, axis=-1).astype(np.uint8)
    elif bg_source == BGSource.TOP:
        gradient = np.linspace(255, 0, input_height)[:, None]
        image = np.tile(gradient, (1, input_width))
        input_bg = np.stack((image,) * 3, axis=-1).astype(np.uint8)
    elif bg_source == BGSource.BOTTOM:
        gradient = np.linspace(0, 255, input_height)[:, None]
        image = np.tile(gradient, (1, input_width))
        input_bg = np.stack((image,) * 3, axis=-1).astype(np.uint8)
    else:
        raise ValueError('Wrong initial latent!')

    rng = torch.Generator(device=device).manual_seed(int(seed))

    # Use input dimensions directly
    fg = resize_without_crop(input_fg, input_width, input_height)

    # The encoded foreground is the extra conditioning consumed by the hooked UNet.
    concat_conds = numpy2pytorch([fg]).to(device=vae.device, dtype=vae.dtype)
    concat_conds = vae.encode(concat_conds).latent_dist.mode() * vae.config.scaling_factor

    conds, unconds = encode_prompt_pair(positive_prompt=prompt + ', ' + a_prompt, negative_prompt=n_prompt)

    if input_bg is None:
        # First pass from noise.
        latents = t2i_pipe(
            prompt_embeds=conds,
            negative_prompt_embeds=unconds,
            width=input_width,
            height=input_height,
            num_inference_steps=steps,
            num_images_per_prompt=num_samples,
            generator=rng,
            output_type='latent',
            guidance_scale=cfg,
            cross_attention_kwargs={'concat_conds': concat_conds},
        ).images.to(vae.dtype) / vae.config.scaling_factor
    else:
        # First pass starting from the synthetic lighting background.
        bg = resize_without_crop(input_bg, input_width, input_height)
        bg_latent = numpy2pytorch([bg]).to(device=vae.device, dtype=vae.dtype)
        bg_latent = vae.encode(bg_latent).latent_dist.mode() * vae.config.scaling_factor
        latents = i2i_pipe(
            image=bg_latent,
            strength=lowres_denoise,
            prompt_embeds=conds,
            negative_prompt_embeds=unconds,
            width=input_width,
            height=input_height,
            # img2img runs only a fraction of the requested steps; scale the
            # request up by 1 / strength.
            num_inference_steps=int(round(steps / lowres_denoise)),
            num_images_per_prompt=num_samples,
            generator=rng,
            output_type='latent',
            guidance_scale=cfg,
            cross_attention_kwargs={'concat_conds': concat_conds},
        ).images.to(vae.dtype) / vae.config.scaling_factor

    # Decode, upscale in pixel space to a multiple of 64, and re-encode.
    pixels = vae.decode(latents).sample
    pixels = pytorch2numpy(pixels)
    pixels = [resize_without_crop(
        image=p,
        target_width=int(round(input_width * highres_scale / 64.0) * 64),
        target_height=int(round(input_height * highres_scale / 64.0) * 64))
        for p in pixels]

    pixels = numpy2pytorch(pixels).to(device=vae.device, dtype=vae.dtype)
    latents = vae.encode(pixels).latent_dist.mode() * vae.config.scaling_factor
    latents = latents.to(device=unet.device, dtype=unet.dtype)

    highres_height, highres_width = latents.shape[2] * 8, latents.shape[3] * 8

    # Re-encode the foreground conditioning at the high-res size.
    fg = resize_without_crop(input_fg, highres_width, highres_height)
    concat_conds = numpy2pytorch([fg]).to(device=vae.device, dtype=vae.dtype)
    concat_conds = vae.encode(concat_conds).latent_dist.mode() * vae.config.scaling_factor

    # Second (refinement) pass.
    latents = i2i_pipe(
        image=latents,
        strength=highres_denoise,
        prompt_embeds=conds,
        negative_prompt_embeds=unconds,
        width=highres_width,
        height=highres_height,
        num_inference_steps=int(round(steps / highres_denoise)),
        num_images_per_prompt=num_samples,
        generator=rng,
        output_type='latent',
        guidance_scale=cfg,
        cross_attention_kwargs={'concat_conds': concat_conds},
    ).images.to(vae.dtype) / vae.config.scaling_factor

    pixels = vae.decode(latents).sample
    pixels = pytorch2numpy(pixels)

    # Resize back to input dimensions
    pixels = [resize_without_crop(p, input_width, input_height) for p in pixels]
    pixels = np.stack(pixels)

    return pixels
@spaces.GPU(duration=60)
@torch.inference_mode()
def process_bg(input_fg, input_bg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, bg_source):
    """Relight a foreground against a background in two diffusion passes.

    Args:
        input_fg: Foreground image array.
        input_bg: Background image array; replaced by a synthetic background
            for the light-direction and ambient sources, and flipped
            horizontally for ``UPLOAD_FLIP``.
        prompt: Main prompt; ``a_prompt`` is appended to it.
        image_width, image_height: Size of the first pass.
        num_samples: Number of images to generate.
        seed: Seed for the torch generator (coerced to int).
        steps: Base number of inference steps.
        a_prompt: Text appended to ``prompt``.
        n_prompt: Negative prompt.
        cfg: Guidance scale.
        highres_scale: Upscale factor applied before the second pass.
        highres_denoise: Denoising strength of the second pass.
        bg_source: ``BGSource`` member or its string value.

    Returns:
        Tuple ``(pixels, [fg, bg])``: the generated images as float32 arrays
        in [0, 1], and the foreground/background crops used for the
        high-res pass.

    Raises:
        ValueError: if ``bg_source`` is not a valid ``BGSource``.
    """
    clear_memory()
    bg_source = BGSource(bg_source)

    if bg_source == BGSource.UPLOAD:
        pass
    elif bg_source == BGSource.UPLOAD_FLIP:
        input_bg = np.fliplr(input_bg)
    elif bg_source == BGSource.GREY:
        input_bg = np.zeros(shape=(image_height, image_width, 3), dtype=np.uint8) + 64
    elif bg_source == BGSource.LEFT:
        gradient = np.linspace(224, 32, image_width)
        image = np.tile(gradient, (image_height, 1))
        input_bg = np.stack((image,) * 3, axis=-1).astype(np.uint8)
    elif bg_source == BGSource.RIGHT:
        gradient = np.linspace(32, 224, image_width)
        image = np.tile(gradient, (image_height, 1))
        input_bg = np.stack((image,) * 3, axis=-1).astype(np.uint8)
    elif bg_source == BGSource.TOP:
        gradient = np.linspace(224, 32, image_height)[:, None]
        image = np.tile(gradient, (1, image_width))
        input_bg = np.stack((image,) * 3, axis=-1).astype(np.uint8)
    elif bg_source == BGSource.BOTTOM:
        gradient = np.linspace(32, 224, image_height)[:, None]
        image = np.tile(gradient, (1, image_width))
        input_bg = np.stack((image,) * 3, axis=-1).astype(np.uint8)
    else:
        # Raising a bare string is itself a TypeError; use a real exception.
        raise ValueError('Wrong background source!')

    # UI widgets may deliver the seed as a float; manual_seed needs an int.
    rng = torch.Generator(device=device).manual_seed(int(seed))

    fg = resize_and_center_crop(input_fg, image_width, image_height)
    bg = resize_and_center_crop(input_bg, image_width, image_height)

    # Encode foreground and background together, then place their latents side
    # by side on the channel axis as conditioning for the hooked UNet.
    concat_conds = numpy2pytorch([fg, bg]).to(device=vae.device, dtype=vae.dtype)
    concat_conds = vae.encode(concat_conds).latent_dist.mode() * vae.config.scaling_factor
    concat_conds = torch.cat([c[None, ...] for c in concat_conds], dim=1)

    conds, unconds = encode_prompt_pair(positive_prompt=prompt + ', ' + a_prompt, negative_prompt=n_prompt)

    # First pass from noise.
    latents = t2i_pipe(
        prompt_embeds=conds,
        negative_prompt_embeds=unconds,
        width=image_width,
        height=image_height,
        num_inference_steps=steps,
        num_images_per_prompt=num_samples,
        generator=rng,
        output_type='latent',
        guidance_scale=cfg,
        cross_attention_kwargs={'concat_conds': concat_conds},
    ).images.to(vae.dtype) / vae.config.scaling_factor

    # Decode, upscale in pixel space to a multiple of 64, and re-encode.
    pixels = vae.decode(latents).sample
    pixels = pytorch2numpy(pixels)
    pixels = [resize_without_crop(
        image=p,
        target_width=int(round(image_width * highres_scale / 64.0) * 64),
        target_height=int(round(image_height * highres_scale / 64.0) * 64))
        for p in pixels]

    pixels = numpy2pytorch(pixels).to(device=vae.device, dtype=vae.dtype)
    latents = vae.encode(pixels).latent_dist.mode() * vae.config.scaling_factor
    latents = latents.to(device=unet.device, dtype=unet.dtype)

    # From here on the working size is the high-res size.
    image_height, image_width = latents.shape[2] * 8, latents.shape[3] * 8
    fg = resize_and_center_crop(input_fg, image_width, image_height)
    bg = resize_and_center_crop(input_bg, image_width, image_height)
    concat_conds = numpy2pytorch([fg, bg]).to(device=vae.device, dtype=vae.dtype)
    concat_conds = vae.encode(concat_conds).latent_dist.mode() * vae.config.scaling_factor
    concat_conds = torch.cat([c[None, ...] for c in concat_conds], dim=1)

    # Second (refinement) pass; the step count is scaled by 1 / strength.
    latents = i2i_pipe(
        image=latents,
        strength=highres_denoise,
        prompt_embeds=conds,
        negative_prompt_embeds=unconds,
        width=image_width,
        height=image_height,
        num_inference_steps=int(round(steps / highres_denoise)),
        num_images_per_prompt=num_samples,
        generator=rng,
        output_type='latent',
        guidance_scale=cfg,
        cross_attention_kwargs={'concat_conds': concat_conds},
    ).images.to(vae.dtype) / vae.config.scaling_factor

    pixels = vae.decode(latents).sample
    pixels = pytorch2numpy(pixels, quant=False)

    clear_memory()
    return pixels, [fg, bg]
@spaces.GPU(duration=60)
@torch.inference_mode()
def process_relight(input_fg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, lowres_denoise, bg_source):
    """Run ``process`` on the foreground and return it with the results.

    Background removal is currently disabled here (see the commented-out
    call), so ``input_fg`` is passed through unchanged.

    Returns:
        Tuple ``(input_fg, results)``.
    """
    # input_fg, matting = run_rmbg(input_fg)
    return input_fg, process(
        input_fg, prompt, image_width, image_height, num_samples, seed, steps,
        a_prompt, n_prompt, cfg, highres_scale, highres_denoise, lowres_denoise,
        bg_source,
    )
@spaces.GPU(duration=60)
@torch.inference_mode()
def process_relight_bg(input_fg, input_bg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, bg_source):
    """Remove the foreground's background, relight it, and save the results.

    Background preparation (flipping an upload or synthesizing a gradient or
    ambient background) is left entirely to ``process_bg``. Doing it here as
    well applied the horizontal flip twice for ``UPLOAD_FLIP``, which
    cancelled it out.

    Args:
        input_fg: Foreground image array.
        input_bg: Background image array (may be unused for synthetic sources).
        prompt, a_prompt, n_prompt: Prompt, appended prompt, negative prompt.
        image_width, image_height, num_samples, seed, steps: Numeric settings;
            coerced to int.
        cfg, highres_scale, highres_denoise: Numeric settings; coerced to float.
        bg_source: ``BGSource`` member or its string value.

    Returns:
        List of generated images as uint8 arrays.

    Raises:
        ValueError: if ``bg_source`` is not a valid ``BGSource``.
    """
    # Validates the value early; an unknown source raises ValueError.
    bg_source = BGSource(bg_source)

    # Convert numerical inputs to appropriate types
    image_width = int(image_width)
    image_height = int(image_height)
    num_samples = int(num_samples)
    seed = int(seed)
    steps = int(steps)
    cfg = float(cfg)
    highres_scale = float(highres_scale)
    highres_denoise = float(highres_denoise)

    # Only the matted RGB image is needed; the RGBA matte is discarded.
    input_fg, _ = run_rmbg(input_fg)

    results, _ = process_bg(input_fg, input_bg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, bg_source)
    # process_bg returns floats in [0, 1]; convert to uint8.
    results = [(x * 255.0).clip(0, 255).astype(np.uint8) for x in results]

    # Save the generated images
    save_images(results, prefix="relight")
    return results
# Lighting presets. Each entry is wrapped in its own single-item list, giving
# one row per preset.
quick_prompts = [[text] for text in (
    'sunshine from window',
    'neon light, city',
    'sunset over sea',
    'golden time',
    'sci-fi RGB glowing, cyberpunk',
    'natural lighting',
    'warm atmosphere, at home, bedroom',
    'magic lit',
    'evil, gothic, Yharnam',
    'light and shadow',
    'shadow from window',
    'soft studio lighting',
    'home atmosphere, cozy bedroom illumination',
    'neon, Wong Kar-wai, warm',
)]

# Subject presets, in the same one-item-per-row shape.
quick_subjects = [[text] for text in (
    'modern sofa, high quality leather',
    'elegant dining table, polished wood',
    'luxurious bed, premium mattress',
    'minimalist office desk, clean design',
    'vintage wooden cabinet, antique finish',
)]
class BGSource(Enum):
    """Background / initial-lighting options.

    Each value is the label string that the processing functions convert back
    with ``BGSource(value)``.
    """
    # Synthetic gradient backgrounds, named after the side the light comes from.
    LEFT = "Left Light"
    RIGHT = "Right Light"
    TOP = "Top Light"
    BOTTOM = "Bottom Light"
    # Uniform background.
    GREY = "Ambient"
    # Use the supplied background image, as-is or mirrored horizontally.
    UPLOAD = "Use Background Image"
    UPLOAD_FLIP = "Use Flipped Background Image"
# Add save function
def save_images(images, prefix="relight"):
    """Save images as PNG files under ``outputs/`` and return their paths.

    Args:
        images: Iterable of numpy arrays or objects exposing ``save(path)``
            (for example PIL images). Arrays are converted with
            ``Image.fromarray``.
        prefix: File name prefix.

    Returns:
        List of ``pathlib.Path`` objects, one per saved image, in input order.
        File names follow ``{prefix}_{timestamp}_{index}.png`` with a 1-based
        index.
    """
    # Create output directory if it doesn't exist
    output_dir = Path("outputs")
    output_dir.mkdir(exist_ok=True)

    # Create timestamp for unique filenames
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

    saved_paths = []
    for index, img in enumerate(images, start=1):
        if isinstance(img, np.ndarray):
            # Convert to PIL Image if numpy array
            img = Image.fromarray(img)
        filepath = output_dir / f"{prefix}_{timestamp}_{index}.png"
        img.save(filepath)
        # Record the path; previously the list was returned empty.
        saved_paths.append(filepath)

    return saved_paths
class MaskMover:
    """Keeps an extracted foreground and a background, and composites them."""

    def __init__(self):
        self.extracted_fg = None
        self.original_fg = None  # Store original foreground
        self.current_bg = None

    def set_extracted_fg(self, fg_image):
        """Store the extracted foreground with alpha channel.

        Args:
            fg_image: Numpy array or anything ``np.array`` can convert.

        Returns:
            The stored copy (``self.extracted_fg``).
        """
        fg_array = fg_image if isinstance(fg_image, np.ndarray) else np.array(fg_image)
        # Two independent copies: one working copy and one pristine original.
        self.extracted_fg = fg_array.copy()
        self.original_fg = fg_array.copy()
        return self.extracted_fg

    def update_background(self, new_bg):
        """Update the current background without affecting the mask.

        ``None`` is ignored, so the previous background is kept.
        """
        if new_bg is not None:
            self.current_bg = new_bg.copy()  # Overwrite the current background
            print("Background updated successfully.")  # Debugging

    @staticmethod
    def _to_rgba(image):
        """Return ``image`` (numpy array or PIL image) as an RGBA PIL image."""
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)
        return image.convert('RGBA')

    def create_composite(self, background, x_pos, y_pos, scale=1.0):
        """Create composite with foreground at specified position.

        Args:
            background: Background image (numpy array or PIL image). When
                ``None``, the stored ``current_bg`` is used instead.
            x_pos, y_pos: Centre of the foreground in background pixels.
            scale: Scale factor applied to the foreground.

        Returns:
            RGB numpy array of the composite, or ``self.current_bg`` unchanged
            when no foreground or no background has been stored yet.
        """
        if self.original_fg is None or self.current_bg is None:
            return self.current_bg  # Return current background if no foreground

        if background is None:
            background = self.current_bg

        bg = self._to_rgba(background)
        fg = self._to_rgba(self.original_fg)

        # Scale the foreground size. Clamp to 1 px because resize() rejects
        # zero-sized targets.
        new_width = max(1, int(fg.width * scale))
        new_height = max(1, int(fg.height * scale))
        fg = fg.resize((new_width, new_height), Image.LANCZOS)

        # Center the scaled foreground at the position
        x = int(x_pos - new_width / 2)
        y = int(y_pos - new_height / 2)

        # Create composite
        result = bg.copy()
        result.paste(fg, (x, y), fg)  # Use fg as the mask (requires fg to be in 'RGBA' mode)
        return np.array(result.convert('RGB'))  # Convert back to 'RGB' if needed
@spaces.GPU(duration=60)
@torch.inference_mode()
def get_depth(image):
    """Compute a colourized depth map of an image with the global depth model.

    Args:
        image: RGB image (PIL image or array), or ``None``.

    Returns:
        PIL image of the depth map rendered with the INFERNO colour map, or
        ``None`` when ``image`` is ``None``.
    """
    if image is None:
        return None

    # Convert from PIL/gradio format to cv2
    raw_img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

    # Get depth map
    depth = model.infer_image(raw_img)  # HxW raw depth map

    # Normalize depth for visualization. A constant depth map has zero range;
    # render it as all zeros instead of dividing by zero.
    depth_min = depth.min()
    depth_range = depth.max() - depth_min
    if depth_range > 0:
        normalized = (depth - depth_min) / depth_range * 255
    else:
        normalized = np.zeros_like(depth)
    depth = normalized.astype(np.uint8)

    # Convert to RGB for display
    depth_colored = cv2.applyColorMap(depth, cv2.COLORMAP_INFERNO)
    depth_colored = cv2.cvtColor(depth_colored, cv2.COLOR_BGR2RGB)
    return Image.fromarray(depth_colored)
from PIL import Image
def extract_foreground(image):
    """Run background removal and remember the RGBA cut-out in ``mask_mover``.

    Returns:
        Tuple of the matted image (``None`` when no image was given) and two
        ``gr.update(visible=True)`` objects.
    """
    if image is not None:
        result, rgba = run_rmbg(image)
        mask_mover.set_extracted_fg(rgba)
    else:
        result = None
    return result, gr.update(visible=True), gr.update(visible=True)
def on_background_upload(new_bg):
    """Forward a newly uploaded background to the shared ``mask_mover``."""
    mask_mover.update_background(new_bg)
    return None
def compress_image(image):
    """Downscale an image and JPEG-compress it toward a 100 KB budget.

    The image is resized so that neither side exceeds 1024 px. It is then
    encoded as JPEG starting at quality 95, and the quality is lowered in
    steps of 5 until the result fits in 100 KB or the quality drops below 20.
    Encoding happens in memory, so concurrent calls cannot overwrite each
    other's data and no file is left behind.

    Args:
        image: Image as a numpy array (as delivered by Gradio).

    Returns:
        The decoded JPEG as a numpy array.
    """
    # Convert Gradio image (numpy array) to PIL Image
    img = Image.fromarray(image)

    # JPEG cannot store an alpha channel; convert such inputs up front.
    if img.mode not in ("RGB", "L"):
        img = img.convert("RGB")

    # Resize image if dimensions are too large
    max_size = 1024  # Maximum dimension size
    if img.width > max_size or img.height > max_size:
        ratio = min(max_size / img.width, max_size / img.height)
        # Clamp to 1 px so extreme aspect ratios never yield a zero-sized side.
        new_size = (max(1, int(img.width * ratio)), max(1, int(img.height * ratio)))
        img = img.resize(new_size, Image.Resampling.LANCZOS)

    size_limit = 100 * 1024  # 100KB limit
    quality = 95  # Start with high quality

    def _encode(q):
        buffer = BytesIO()
        img.save(buffer, "JPEG", quality=q)
        return buffer

    encoded = _encode(quality)
    # Check encoded size and adjust quality if necessary
    while encoded.getbuffer().nbytes > size_limit:
        quality -= 5  # Decrease quality
        encoded = _encode(quality)
        if quality < 20:  # Prevent quality from going too low
            break

    # Convert back to numpy array for Gradio
    encoded.seek(0)
    compressed_img = np.array(Image.open(encoded))
    return compressed_img
# @hydra.main(config_path="/home/user/app/configs", config_name="sam2_hiera_l")
def _detections_from_predictions(predictions, known_classes=()):
    """Convert DINO-X prediction objects into ``sv.Detections`` plus labels.

    Args:
        predictions: Objects exposing ``bbox``, ``mask`` (with RLE ``counts``
            and ``size``), ``score`` and ``category``.
        known_classes: Class names that receive ids in the given order;
            categories not listed get the next free id instead of raising
            ``KeyError``.

    Returns:
        ``(detections, labels)``. ``detections`` is ``None`` when no
        prediction with a mask is available.
    """
    class_name_to_id = {}
    for name in known_classes:
        class_name_to_id.setdefault(name, len(class_name_to_id))

    boxes, masks, class_ids, labels = [], [], [], []
    for obj in predictions:
        # A mask is required to build the cut-out; skip objects without one.
        if getattr(obj, "mask", None) is None:
            continue
        cls_name = obj.category.lower().strip()
        boxes.append(obj.bbox)
        # Convert the RLE mask to an np.array using the DDS API helpers.
        masks.append(DetectionTask.rle2mask(DetectionTask.string2rle(obj.mask.counts), obj.mask.size))
        class_ids.append(class_name_to_id.setdefault(cls_name, len(class_name_to_id)))
        labels.append(f"{cls_name} {obj.score:.2f}")

    if not boxes:
        return None, labels

    masks = np.array(masks)
    if masks.ndim == 4:
        masks = masks.squeeze(1)

    detections = sv.Detections(
        xyxy=np.array(boxes),
        mask=masks.astype(bool),
        class_id=np.array(class_ids),
    )
    return detections, labels


def _cutout_from_mask(image, mask):
    """Return ``image`` as RGBA cropped to the bounding box of ``mask``.

    Pixels outside ``mask`` get alpha 0. Returns ``None`` for an empty mask.
    """
    ys, xs = np.nonzero(mask)
    if ys.size == 0:
        return None
    alpha = np.where(mask, 255, 0).astype(np.uint8)
    rgba = np.dstack((image[..., :3], alpha)).astype(np.uint8)
    # Crop to mask bounds to minimize image size.
    return rgba[ys.min():ys.max() + 1, xs.min():xs.max() + 1]


@spaces.GPU(duration=60)
@torch.inference_mode()
def process_image(input_image, input_text):
    """Main processing function for the Gradio interface.

    Uploads ``input_image`` to the DDS cloud API, runs a DINO-X task (prompted
    with ``input_text``, or prompt-free when the text names no classes),
    draws boxes, labels and masks on a copy of the image, and registers the
    first detected object as the extracted foreground on ``mask_mover``.

    Args:
        input_image: RGB image as a numpy array, or ``None``.
        input_text: Period-separated class names (e.g. ``"car . person ."``);
            empty to detect all objects.

    Returns:
        A 4-tuple ``(annotated_frame, cropped_rgba, update, update)``. The two
        updates show the wired slider components when a foreground was
        extracted and hide them otherwise, in which case ``cropped_rgba`` is
        ``None``.
    """
    if input_image is None:
        return None, None, gr.update(visible=False), gr.update(visible=False)

    # NOTE(review): the token below is a credential committed to source. It is
    # kept as a fallback so existing deployments keep working; prefer setting
    # DDS_API_TOKEN in the environment and rotating this value.
    api_token = os.environ.get("DDS_API_TOKEN", "9c8c865e10ec1821bea79d9fa9dc8720")

    # Nothing in this function writes here; creating the directory is retained
    # from the previous implementation.
    Path("outputs/grounded_sam2_dinox_demo").mkdir(parents=True, exist_ok=True)

    dds_client = Client(Config(api_token))

    prompt_text = (input_text or "").strip()
    classes = [part.strip().lower() for part in prompt_text.split('.') if part.strip()]

    # Save the input image to a temp file and upload it. cv2.imwrite expects
    # BGR while Gradio delivers RGB, so convert first; the file is removed
    # even when the upload fails.
    with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as tmpfile:
        tmp_path = tmpfile.name
    try:
        cv2.imwrite(tmp_path, cv2.cvtColor(input_image, cv2.COLOR_RGB2BGR))
        image_url = dds_client.upload_file(tmp_path)
    finally:
        os.remove(tmp_path)

    if classes:
        # Run DINO-X detection for the requested classes.
        task = DinoxTask(
            image_url=image_url,
            prompts=[TextPrompt(text=prompt_text)],
            targets=[DetectionTarget.BBox, DetectionTarget.Mask]
        )
    else:
        task = DinoxTask(
            image_url=image_url,
            prompts=[TextPrompt(text="<prompt_free>")],
        )
    dds_client.run_task(task)

    detections, labels = _detections_from_predictions(task.result.objects, classes)

    annotated_frame = input_image.copy()
    if detections is None:
        return annotated_frame, None, gr.update(visible=False), gr.update(visible=False)

    annotated_frame = sv.BoxAnnotator().annotate(scene=annotated_frame, detections=detections)
    annotated_frame = sv.LabelAnnotator().annotate(scene=annotated_frame, detections=detections, labels=labels)
    annotated_frame = sv.MaskAnnotator().annotate(scene=annotated_frame, detections=detections)

    # Create transparent cut-out for the first detected object.
    cropped_rgba = _cutout_from_mask(input_image, detections.mask[0])
    if cropped_rgba is None:
        return annotated_frame, None, gr.update(visible=False), gr.update(visible=False)

    # Set extracted foreground for mask mover.
    mask_mover.set_extracted_fg(cropped_rgba)
    return annotated_frame, cropped_rgba, gr.update(visible=True), gr.update(visible=True)
# Build the Gradio application. Everything below declares components and
# containers; event handlers are wired after the layout of each tab.
block = gr.Blocks().queue()
with block:
# ---- "Text" tab: foreground-only relighting (created with visible=False) ----
with gr.Tab("Text", visible=False):
with gr.Row():
gr.Markdown("## Product Placement from Text")
with gr.Row():
with gr.Column():
# Source image from which the foreground is taken.
with gr.Row():
input_fg = gr.Image(type="numpy", label="Image", height=480)
# Two ways to obtain the foreground: text-prompted segmentation or
# plain background removal.
with gr.Row():
with gr.Group():
find_objects_button = gr.Button(value="(Option 1) Segment Object from text")
text_prompt = gr.Textbox(
label="Text Prompt",
placeholder="Enter object classes separated by periods (e.g. 'couch . table .') or leave empty to get all objects",
value=""
)
extract_button = gr.Button(value="(Option 2) Remove Background")
# Preview of the annotated detections and of the extracted foreground.
with gr.Row():
extracted_objects = gr.Image(type="numpy", label="Extracted Foreground", height=480)
extracted_fg = gr.Image(type="numpy", label="Extracted Foreground", height=480)
# Position sliders start hidden; the extraction handlers return updates
# that change their visibility.
with gr.Row():
x_slider = gr.Slider(
minimum=0,
maximum=1000,
label="X Position",
value=500,
visible=False
)
y_slider = gr.Slider(
minimum=0,
maximum=1000,
label="Y Position",
value=500,
visible=False
)
# output_bg = gr.Image(type="numpy", label="Preprocessed Foreground", height=480)
# Prompt and initial-latent lighting choice, plus quick-pick datasets that
# fill the prompt textbox.
with gr.Group():
prompt = gr.Textbox(label="Prompt")
bg_source = gr.Radio(choices=[e.value for e in BGSource],
value=BGSource.GREY.value,
label="Lighting Preference (Initial Latent)", type='value')
example_quick_subjects = gr.Dataset(samples=quick_subjects, label='Subject Quick List', samples_per_page=1000, components=[prompt])
example_quick_prompts = gr.Dataset(samples=quick_prompts, label='Lighting Quick List', samples_per_page=1000, components=[prompt])
relight_button = gr.Button(value="Relight")
# Sampling settings.
with gr.Group():
with gr.Row():
num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1)
seed = gr.Number(label="Seed", value=12345, precision=0)
with gr.Row():
image_width = gr.Slider(label="Image Width", minimum=256, maximum=1024, value=512, step=64)
image_height = gr.Slider(label="Image Height", minimum=256, maximum=1024, value=640, step=64)
with gr.Accordion("Advanced options", open=False):
steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=15, step=1)
cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=2, step=0.01)
lowres_denoise = gr.Slider(label="Lowres Denoise (for initial latent)", minimum=0.1, maximum=1.0, value=0.9, step=0.01)
highres_scale = gr.Slider(label="Highres Scale", minimum=1.0, maximum=3.0, value=1.5, step=0.01)
highres_denoise = gr.Slider(label="Highres Denoise", minimum=0.1, maximum=1.0, value=0.5, step=0.01)
a_prompt = gr.Textbox(label="Added Prompt", value='best quality')
n_prompt = gr.Textbox(label="Negative Prompt", value='lowres, bad anatomy, bad hands, cropped, worst quality')
# Output column.
with gr.Column():
result_gallery = gr.Gallery(height=832, object_fit='contain', label='Outputs')
with gr.Row():
dummy_image_for_outputs = gr.Image(visible=False, label='Result')
# Example gallery is disabled; it refers to output_bg, which is commented
# out above.
# gr.Examples(
# fn=lambda *args: ([args[-1]], None),
# examples=db_examples.foreground_conditioned_examples,
# inputs=[
# input_fg, prompt, bg_source, image_width, image_height, seed, dummy_image_for_outputs
# ],
# outputs=[result_gallery, output_bg],
# run_on_click=True, examples_per_page=1024
# )
# Inputs handed positionally to process_relight when "Relight" is clicked.
ips = [extracted_fg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, lowres_denoise, bg_source]
relight_button.click(fn=process_relight, inputs=ips, outputs=[extracted_fg, result_gallery], show_progress=True)
# Quick lists: a lighting sample is appended after the first two
# comma-separated parts of the prompt; a subject sample replaces the prompt.
example_quick_prompts.click(lambda x, y: ', '.join(y.split(', ')[:2] + [x[0]]), inputs=[example_quick_prompts, prompt], outputs=prompt, show_progress=False, queue=False)
example_quick_subjects.click(lambda x: x[0], inputs=example_quick_subjects, outputs=prompt, show_progress=False, queue=False)
extract_button.click(
    fn=extract_foreground,
    inputs=[input_fg],
    outputs=[extracted_fg, x_slider, y_slider],
    show_progress=True,
)
find_objects_button.click(
    fn=process_image,
    inputs=[input_fg, text_prompt],
    # process_image returns four values (annotated frame, cut-out and two
    # slider visibility updates), so four outputs must be listed.
    outputs=[extracted_objects, extracted_fg, x_slider, y_slider],
    show_progress=True,
)
# ---- "Background" tab: foreground placed on an uploaded background ----
# The component variables below reuse names from the "Text" tab
# (extracted_fg, x_slider, prompt, ...); from here on those names refer to
# the components created in this tab.
with gr.Tab("Background", visible=True):
# empty cache
# Holder for the extracted foreground; read by the extraction and relight
# handlers defined in this file.
mask_mover = MaskMover()
# with torch.no_grad():
# # Update the input channels to 12
# new_conv_in = torch.nn.Conv2d(12, unet.conv_in.out_channels, unet.conv_in.kernel_size, unet.conv_in.stride, unet.conv_in.padding) # Changed from 8 to 12
# new_conv_in.weight.zero_()
# new_conv_in.weight[:, :4, :, :].copy_(unet.conv_in.weight)
# new_conv_in.bias = unet.conv_in.bias
# unet.conv_in = new_conv_in
with gr.Row():
gr.Markdown("## IC-Light (Relighting with Foreground and Background Condition)")
gr.Markdown("πŸ’Ύ Generated images are automatically saved to 'outputs' folder")
with gr.Row():
with gr.Column():
# Step 1: Input and Extract
with gr.Row():
with gr.Group():
gr.Markdown("Extract Foreground")
input_image = gr.Image(type="numpy", label="Input Image", height=480)
with gr.Row():
with gr.Group():
find_objects_button = gr.Button(value="(Option 1) Segment Object from text")
text_prompt = gr.Textbox(
label="Text Prompt",
placeholder="Enter object classes separated by periods (e.g. 'car . person .'). Leave empty to get all objects",
value=""
)
extract_button = gr.Button(value="(Option 2) Remove Background")
# Annotated detections and the extracted foreground cut-out.
with gr.Row():
extracted_objects = gr.Image(type="numpy", label="Extracted Foreground", height=480)
extracted_fg = gr.Image(type="numpy", label="Extracted Foreground", height=480)
with gr.Row():
# Step 2: Background and Position
with gr.Group():
gr.Markdown("Position on Background")
input_bg = gr.Image(type="numpy", label="Background Image", height=480)
reset_button = gr.Button(value="Reset Background")
# Position sliders start hidden; the extraction handlers return updates
# that change their visibility.
with gr.Row():
x_slider = gr.Slider(
minimum=0,
maximum=1000,
label="X Position",
value=500,
visible=False
)
y_slider = gr.Slider(
minimum=0,
maximum=1000,
label="Y Position",
value=500,
visible=False
)
fg_scale_slider = gr.Slider(
label="Foreground Scale",
minimum=0.01,
maximum=3.0,
value=1.0,
step=0.01
)
# Created hidden; no handler in this section reads or writes it.
editor = gr.ImageEditor(
type="numpy",
label="Position Foreground",
height=480,
visible=False
)
get_depth_button = gr.Button(value="Get Depth")
depth_image = gr.Image(type="numpy", label="Depth Image", height=480)
# Step 3: Relighting Options
with gr.Group():
gr.Markdown("### Step 3: Relighting Settings")
prompt = gr.Textbox(label="Prompt")
# Hidden control whose value is fixed to the UPLOAD option.
bg_source = gr.Radio(
choices=[e.value for e in BGSource],
value=BGSource.UPLOAD.value,
label="Background Source",
type='value',
visible=False
)
example_quick_subjects = gr.Dataset(samples=quick_subjects, label='Subject Quick List', samples_per_page=1000, components=[prompt])
example_quick_prompts = gr.Dataset(samples=quick_prompts, label='Lighting Quick List', samples_per_page=1000, components=[prompt])
# bg_gallery = gr.Gallery(
# height=450,
# label='Background Quick List',
# value=db_examples.bg_samples,
# columns=5,
# allow_preview=False
# )
relight_button_bg = gr.Button(value="Relight")
# Additional settings
# The whole group is hidden, so the defaults below are what the relight
# handler receives.
with gr.Group(visible=False):
with gr.Row():
num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1)
seed = gr.Number(label="Seed", value=12345, precision=0)
with gr.Row():
image_width = gr.Slider(label="Image Width", minimum=256, maximum=1024, value=512, step=64)
image_height = gr.Slider(label="Image Height", minimum=256, maximum=1024, value=640, step=64)
with gr.Accordion("Advanced options", open=False):
steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1)
cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=7.0, step=0.01, visible=False)
highres_scale = gr.Slider(label="Highres Scale", minimum=1.0, maximum=2.0, value=1.2, step=0.01)
highres_denoise = gr.Slider(label="Highres Denoise", minimum=0.1, maximum=0.9, value=0.5, step=0.01)
a_prompt = gr.Textbox(label="Added Prompt", value='best quality', visible=False)
n_prompt = gr.Textbox(
label="Negative Prompt",
value='lowres, bad anatomy, bad hands, cropped, worst quality', visible=False
)
# Output column: a single image rather than a gallery.
with gr.Column():
result_gallery = gr.Image(height=832, label='Outputs')
# Placeholder for a stored clean background; only assigned here in the
# visible code.
original_bg = None
# Option 2: plain background removal on the input image.
extract_button.click(
    fn=extract_foreground,
    inputs=[input_image],
    outputs=[extracted_fg, x_slider, y_slider],
    show_progress=True,
)
# Option 1: text-prompted segmentation; also returns slider visibility updates.
find_objects_button.click(
    fn=process_image,
    inputs=[input_image, text_prompt],
    outputs=[extracted_objects, extracted_fg, x_slider, y_slider],
    show_progress=True,
)
# Depth preview of the background image.
get_depth_button.click(
    fn=get_depth,
    inputs=[input_bg],
    outputs=[depth_image],
    show_progress=True,
)
# def update_position(background, x_pos, y_pos, scale):
# """Update composite when position changes"""
# global original_bg
# if background is None:
# return None
# if original_bg is None:
# original_bg = background.copy()
# # Convert string values to float
# x_pos = float(x_pos)
# y_pos = float(y_pos)
# scale = float(scale)
# return mask_mover.create_composite(original_bg, x_pos, y_pos, scale)
# class BackgroundManager:
# def __init__(self):
# self.original_bg = None # To store the original background
# def update_background(self, new_bg):
# """Set a new background."""
# if new_bg is not None:
# self.original_bg = new_bg.copy()
# print("Background updated successfully.") # Debugging
# def update_position(self, background, x_pos, y_pos, scale):
# """Update composite when position changes."""
# if self.original_bg is None:
# print("No original background set.")
# return None
# # Start from a clean copy of the original background
# fresh_bg = self.original_bg.copy()
# # Debugging
# print(f"Updating position: x={x_pos}, y={y_pos}, scale={scale}")
# # Composite the foreground onto the fresh background
# composite = mask_mover.create_composite(fresh_bg, x_pos, y_pos, scale)
# return composite
class BackgroundManager:
    """Keeps the background image and composites the foreground onto it.

    Attributes are plain instance fields:
      original_bg  - first background ever supplied (never overwritten)
      current_bg   - most recent background supplied
      current_mask - last composite produced by ``update_position``
      mask_mover   - object providing ``create_composite(bg, x, y, scale)``
    """

    def __init__(self, mask_mover):
        self.original_bg = None  # To store the original background
        self.current_bg = None  # Store current background
        self.current_mask = None  # Last composite; set by update_position
        self.mask_mover = mask_mover  # Reference to the MaskMover instance

    def update_background(self, new_bg):
        """Record ``new_bg`` as the current background; ``None`` is ignored.

        The first non-``None`` background is additionally kept as the
        original so later composites can start from a clean image.
        """
        if new_bg is not None:
            if self.original_bg is None:  # Store the original background only once
                self.original_bg = new_bg.copy()
            self.current_bg = new_bg.copy()  # Overwrite the current background
            print("Background updated successfully.")  # Debugging

    def reset_background(self):
        """Restore the original background and return it.

        Returns:
            The current background after the reset (``None`` when no
            background has been supplied yet). Returning it lets a UI handler
            display the restored image instead of clearing its output.
        """
        if self.original_bg is not None:
            self.current_bg = self.original_bg.copy()  # Reset to original background
            print("Background reset to original state.")  # Debugging
        return self.current_bg

    def update_position(self, x_pos, y_pos, scale):
        """Composite the foreground onto a clean copy of the original background.

        Returns:
            The composite returned by ``mask_mover.create_composite``, or
            ``None`` when no background has been supplied yet.
        """
        if self.original_bg is None:
            print("No original background set.")
            return None
        # Debugging
        print(f"Updating position: x={x_pos}, y={y_pos}, scale={scale}")
        # Start from a copy of the stored background. The previous code passed
        # mask_mover.original_fg here, which is the foreground image.
        composite = self.mask_mover.create_composite(self.original_bg.copy(), x_pos, y_pos, scale)
        self.current_mask = composite
        return composite
# Create an instance of BackgroundManager
#bg_manager = gr.State(BackgroundManager())
bg_manager = BackgroundManager(mask_mover)


def update_position_wrapper(background, x_pos, y_pos, scale):
    """Recomposite the foreground at the given position and scale.

    ``background`` is accepted so the signature stays unchanged, but it is not
    forwarded: ``BackgroundManager.update_position`` takes only the position
    and scale, and passing four arguments raised ``TypeError``.
    """
    return bg_manager.update_position(x_pos, y_pos, scale)
# def update_position(background, x_pos, y_pos, scale):
# if background is None:
# return None
# fresh_bg = bg_manager.original_bg.copy() # Start from a clean original background
# # Composite the extracted foreground onto fresh_bg
# return mask_mover.create_composite(fresh_bg, float(x_pos), float(y_pos), float(scale))
# input_bg.change(
# fn=lambda new_bg: bg_manager.update_background(new_bg) or new_bg,
# inputs=[input_bg],
# outputs=[input_bg],
# show_progress=False
# )
# x_slider.change(
# fn=bg_manager.update_position,
# inputs=[input_bg, x_slider, y_slider, fg_scale_slider],
# outputs=[input_bg]
# )
# y_slider.change(
# fn=bg_manager.update_position,
# inputs=[input_bg, x_slider, y_slider, fg_scale_slider],
# outputs=[input_bg]
# )
# fg_scale_slider.change(
# fn=bg_manager.update_position,
# inputs=[input_bg, x_slider, y_slider, fg_scale_slider],
# outputs=[input_bg]
# )
# Update the input_bg.change function to reset the mask
# input_bg.change(
# fn=lambda new_bg: bg_manager.update_background(new_bg) or new_bg,
# inputs=[input_bg],
# outputs=[input_bg],
# show_progress=False
# )
# Remember every background that lands in input_bg. The handler returns
# nothing, so no output is listed: routing its None result back into input_bg
# would clear the image the user just supplied.
input_bg.change(
    fn=bg_manager.update_background,
    inputs=[input_bg],
    outputs=None,
    show_progress=False,
)
# Slider change functions. All three controls are passed as inputs so the
# handler receives their live values; reading `component.value` inside a
# callback only yields the value the component was constructed with.
position_inputs = [x_slider, y_slider, fg_scale_slider]
for position_control in position_inputs:
    position_control.change(
        fn=bg_manager.update_position,
        inputs=position_inputs,
        outputs=[input_bg],
    )
# Update inputs list to include fg_scale_slider
def process_relight_with_position(*args):
    """Relight only the region of the background around the placed foreground.

    Positional layout of ``args`` as used below: ``args[0]`` is the foreground
    slot (replaced by the cropped composite), ``args[1]`` the background
    image, ``args[3]`` / ``args[4]`` the width / height (replaced by the crop
    size), and the last three values are x position, y position and
    foreground scale. Everything except those last three is forwarded to
    ``process_relight_bg``.

    Returns:
        A copy of the background with the relit, recomposited crop pasted
        back, or ``None`` (after a ``gr.Warning``) when the foreground or
        background is missing or the crop region is empty.
    """
    if mask_mover.extracted_fg is None:
        gr.Warning("Please extract foreground first")
        return None

    background = args[1]  # Get background image
    if background is None:
        gr.Warning("Please upload a background image first")
        return None

    x_pos, y_pos, scale = (float(value) for value in args[-3:])

    # Get original foreground size after scaling
    fg = Image.fromarray(mask_mover.original_fg)
    new_width = int(fg.width * scale)
    new_height = int(fg.height * scale)

    # Calculate crop region around foreground position
    crop_x = int(x_pos - new_width / 2)
    crop_y = int(y_pos - new_height / 2)
    crop_width = new_width
    crop_height = new_height

    # Add padding for context (20% extra on each side)
    padding = 0.2
    crop_x = int(crop_x - crop_width * padding)
    crop_y = int(crop_y - crop_height * padding)
    crop_width = int(crop_width * (1 + 2 * padding))
    crop_height = int(crop_height * (1 + 2 * padding))

    # Ensure crop dimensions are multiples of 8
    crop_width = ((crop_width + 7) // 8) * 8
    crop_height = ((crop_height + 7) // 8) * 8

    # Ensure crop region is within image bounds
    bg_height, bg_width = background.shape[:2]
    crop_x = max(0, min(crop_x, bg_width - crop_width))
    crop_y = max(0, min(crop_y, bg_height - crop_height))

    # Get actual crop dimensions after boundary check, again as multiples of 8
    crop_width = (min(crop_width, bg_width - crop_x) // 8) * 8
    crop_height = (min(crop_height, bg_height - crop_y) // 8) * 8

    # A tiny scale or a very small background leaves nothing to relight.
    if crop_width <= 0 or crop_height <= 0:
        gr.Warning("Foreground region is too small to relight")
        return None

    # Crop region from background
    crop_region = background[crop_y:crop_y + crop_height, crop_x:crop_x + crop_width]

    # Create composite in cropped region.
    # NOTE(review): crop_width / crop_height are already padded and clamped
    # here, so this offset differs from the padding applied to crop_x / crop_y
    # above - confirm this is the intended local position.
    fg_local_x = int(new_width / 2 + crop_width * padding)
    fg_local_y = int(new_height / 2 + crop_height * padding)
    cropped_composite = mask_mover.create_composite(crop_region, fg_local_x, fg_local_y, scale)

    # Process the cropped region: swap in the crop and its size, then drop the
    # trailing position and scale arguments.
    crop_args = list(args)
    crop_args[0] = cropped_composite
    crop_args[1] = crop_region
    crop_args[3] = crop_width
    crop_args[4] = crop_height
    crop_args = crop_args[:-3]

    # Get relit result
    relit_crop = process_relight_bg(*crop_args)[0]

    # Resize relit result to match crop dimensions if needed
    if relit_crop.shape[:2] != (crop_height, crop_width):
        relit_crop = resize_without_crop(relit_crop, crop_width, crop_height)

    final_composite = mask_mover.create_composite(relit_crop, fg_local_x, fg_local_y, scale)

    # Place relit crop back into original background
    result = background.copy()
    result[crop_y:crop_y + crop_height, crop_x:crop_x + crop_width] = final_composite
    return result
# NOTE(review): original_bg appears to stay None in this file, in which case
# the live input_bg component is what gets wired in.
chosen_bg = original_bg.copy() if original_bg else input_bg

# Inputs for the relight handler; the last three entries are the position and
# scale values it peels off the end of its argument list.
ips_bg = [
    input_fg,
    chosen_bg,
    prompt,
    image_width,
    image_height,
    num_samples,
    seed,
    steps,
    a_prompt,
    n_prompt,
    cfg,
    highres_scale,
    highres_denoise,
    bg_source,
    x_slider,
    y_slider,
    fg_scale_slider,
]

relight_button_bg.click(
    fn=process_relight_with_position,
    inputs=ips_bg,
    outputs=[result_gallery],
    show_progress=True,
)

# Reset button: calls the manager's reset and routes the result to input_bg.
reset_button.click(
    fn=lambda: bg_manager.reset_background(),
    inputs=[],
    outputs=[input_bg],
    show_progress=True,
)

# Quick lists: a lighting sample is appended after the first two
# comma-separated parts of the prompt; a subject sample replaces the prompt.
example_quick_prompts.click(
    lambda x, y: ', '.join(y.split(', ')[:2] + [x[0]]),
    inputs=[example_quick_prompts, prompt],
    outputs=prompt,
    show_progress=False,
    queue=False,
)
example_quick_subjects.click(
    lambda x: x[0],
    inputs=example_quick_subjects,
    outputs=prompt,
    show_progress=False,
    queue=False,
)

block.launch(server_name='0.0.0.0', share=False)