Spaces:
Sleeping
Sleeping
import os | |
os.environ ['HF_ENDPOINT'] = 'https://hf-mirror.com' | |
from ast import main | |
from numpy import imag | |
import torch | |
from diffusers import StableDiffusionPipeline | |
import os | |
from PIL import Image | |
def normalize_bbox(bboxes, img_width, img_height): | |
normalized_bboxes = [] | |
for box in bboxes: | |
x_min, y_min, x_max, y_max = box | |
x_min = (x_min / img_width) | |
y_min = (y_min / img_height) | |
x_max = (x_max / img_width) | |
y_max = (y_max / img_height) | |
normalized_bboxes.append([x_min, y_min, x_max, y_max]) | |
return normalized_bboxes | |
def create_reco_prompt( | |
caption: str = '', | |
phrases=[], | |
boxes=[], | |
normalize_boxes=True, | |
image_resolution=512, | |
num_bins=1000, | |
): | |
""" | |
method to create ReCo prompt | |
caption: global caption | |
phrases: list of regional captions | |
boxes: list of regional coordinates (unnormalized xyxy) | |
""" | |
SOS_token = '<|startoftext|>' | |
EOS_token = '<|endoftext|>' | |
box_captions_with_coords = [] | |
box_captions_with_coords += [caption] | |
box_captions_with_coords += [EOS_token] | |
for phrase, box in zip(phrases, boxes): | |
if normalize_boxes: | |
box = [float(x) / image_resolution for x in box] | |
# quantize into bins | |
quant_x0 = int(round((box[0] * (num_bins - 1)))) | |
quant_y0 = int(round((box[1] * (num_bins - 1)))) | |
quant_x1 = int(round((box[2] * (num_bins - 1)))) | |
quant_y1 = int(round((box[3] * (num_bins - 1)))) | |
# ReCo format | |
# Add SOS/EOS before/after regional captions | |
box_captions_with_coords += [ | |
f"<bin{str(quant_x0).zfill(3)}>", | |
f"<bin{str(quant_y0).zfill(3)}>", | |
f"<bin{str(quant_x1).zfill(3)}>", | |
f"<bin{str(quant_y1).zfill(3)}>", | |
SOS_token, | |
phrase, | |
EOS_token | |
] | |
text = " ".join(box_captions_with_coords) | |
return text | |
def inference_image(pipe, prompt, grounding_instruction, state): | |
print(prompt) | |
print(grounding_instruction) | |
bbox = state['boxes'] | |
# bbox = state | |
print(bbox) | |
bbox = normalize_bbox(bbox, 600, 600) | |
print(bbox) | |
objects = [obj for obj in grounding_instruction.split(';') if obj.strip()] | |
print(objects) | |
prompt_reco = create_reco_prompt(prompt, objects, bbox, normalize_boxes=False) | |
print(prompt_reco) | |
image = pipe(prompt_reco, guidance_scale=4).images[0] | |
return image | |
if __name__ == "__main__": | |
path = '/home/bcy/cache/.cache/huggingface/hub/models--j-min--reco_sd14_coco/snapshots/11a062da5a0a84501047cb19e113f520eb610415' if os.path.isdir('/home/bcy/cache/.cache/huggingface/hub/models--j-min--reco_sd14_coco/snapshots/11a062da5a0a84501047cb19e113f520eb610415') else "CompVis/stable-diffusion-v1-4" | |
pipe = StableDiffusionPipeline.from_pretrained( | |
"j-min/reco_sd14_coco", | |
torch_dtype=torch.float16 | |
) | |
pipe = pipe.to("cuda") | |
# caption = "A box contains six donuts with varying types of glazes and toppings." | |
# phrases = ["chocolate donut.", "dark vanilla donut.", "donut with sprinkles.", "donut with powdered sugar.", "pink donut.", "brown donut."] | |
# boxes = [[263.68, 294.912, 380.544, 392.832], [121.344, 265.216, 267.392, 401.92], [391.168, 294.912, 506.368, 381.952], [120.064, 143.872, 268.8, 270.336], [264.192, 132.928, 393.216, 263.68], [386.048, 148.48, 490.688, 259.584]] | |
# prompt = create_reco_prompt(caption, phrases, boxes) | |
# print(prompt) | |
# generated_image = pipe( | |
# prompt, | |
# guidance_scale=4).images[0] | |
# generated_image.save("output1.jpg") | |
prompt = "a dog and a cat;" | |
grounding_instruction = "cut dog; big cat;" | |
bbox = [(136, 252, 280, 455), (284, 205, 480, 500)] | |
inference_image(pipe, prompt, grounding_instruction, bbox) | |