"""Minimal ReCo (Region-Controlled Text-to-Image) inference script.

Builds a ReCo-style prompt -- global caption, then per-region quantized
``<bin###>`` coordinate tokens and regional captions -- and runs it through
a Stable Diffusion pipeline fine-tuned for ReCo (``j-min/reco_sd14_coco``).

NOTE(review): the original file also had ``from ast import main``,
``from numpy import imag``, a duplicate ``import os`` and an unused
``from PIL import Image`` -- all accidental/unused, removed here.
"""

import os

# Route Hugging Face downloads through a mirror. Must be set before any
# huggingface_hub-backed library performs network access.
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

import torch
from diffusers import StableDiffusionPipeline


def normalize_bbox(bboxes, img_width, img_height):
    """Scale absolute xyxy pixel boxes into [0, 1] relative coordinates.

    Args:
        bboxes: iterable of (x_min, y_min, x_max, y_max) in pixels.
        img_width: image width in pixels.
        img_height: image height in pixels.

    Returns:
        List of ``[x_min, y_min, x_max, y_max]`` normalized to image size.
    """
    return [
        [x_min / img_width, y_min / img_height,
         x_max / img_width, y_max / img_height]
        for x_min, y_min, x_max, y_max in bboxes
    ]


def create_reco_prompt(
    caption: str = '',
    phrases=(),
    boxes=(),
    normalize_boxes: bool = True,
    image_resolution: int = 512,
    num_bins: int = 1000,
):
    """Create a ReCo prompt string.

    Args:
        caption: global caption.
        phrases: list of regional captions, one per box.
        boxes: list of xyxy coordinates (unnormalized pixels when
            ``normalize_boxes`` is True, otherwise already in [0, 1]).
        normalize_boxes: divide coordinates by ``image_resolution`` first.
        image_resolution: resolution used for normalization.
        num_bins: number of quantization bins for the coordinate tokens.

    Returns:
        Prompt of the form
        ``caption <|endoftext|> <bin###><bin###><bin###><bin###>
        <|startoftext|> phrase <|endoftext|> ...``

    Note:
        The original file emitted four *empty* f-strings per box -- the
        angle-bracketed ``<bin...>`` tokens had been stripped, so prompts
        carried no positional information. Restored per the ReCo
        reference implementation. Mutable list defaults were also
        replaced with immutable tuples (never mutated; fully compatible).
    """
    SOS_token = '<|startoftext|>'
    EOS_token = '<|endoftext|>'

    tokens = [caption, EOS_token]
    for phrase, box in zip(phrases, boxes):
        if normalize_boxes:
            box = [float(coord) / image_resolution for coord in box]

        # Quantize each normalized coordinate into one of `num_bins` bins,
        # then render as zero-padded <bin###> tokens (num_bins=1000 -> 3 digits).
        tokens += [f"<bin{int(round(coord * (num_bins - 1))):03d}>" for coord in box]
        # Each regional caption is wrapped in SOS/EOS markers.
        tokens += [SOS_token, phrase, EOS_token]

    return " ".join(tokens)


def inference_image(pipe, prompt, grounding_instruction, state):
    """Generate one image from a caption plus grounded regional captions.

    Args:
        pipe: a loaded ``StableDiffusionPipeline``.
        prompt: global caption.
        grounding_instruction: ``;``-separated regional captions, one per box.
        state: either a dict with a ``'boxes'`` key or the box list itself;
            boxes are absolute xyxy pixel coordinates on a 600x600 canvas
            (assumed -- TODO confirm the drawing-canvas size with the caller).

    Returns:
        The generated PIL image.
    """
    # Bug fix: the original indexed state['boxes'] unconditionally, but the
    # __main__ block passes the raw box list (TypeError: list indices must be
    # integers). Accept both shapes, keeping dict callers working.
    bbox = state['boxes'] if isinstance(state, dict) else state
    print(prompt)
    print(grounding_instruction)
    print(bbox)
    bbox = normalize_bbox(bbox, 600, 600)
    print(bbox)
    objects = [obj for obj in grounding_instruction.split(';') if obj.strip()]
    print(objects)
    # Boxes were normalized above, so tell create_reco_prompt not to rescale.
    prompt_reco = create_reco_prompt(prompt, objects, bbox, normalize_boxes=False)
    print(prompt_reco)
    return pipe(prompt_reco, guidance_scale=4).images[0]


if __name__ == "__main__":
    # Prefer a local snapshot when present, else a hub fallback id.
    local_snapshot = (
        '/home/bcy/cache/.cache/huggingface/hub/'
        'models--j-min--reco_sd14_coco/snapshots/'
        '11a062da5a0a84501047cb19e113f520eb610415'
    )
    path = local_snapshot if os.path.isdir(local_snapshot) else "CompVis/stable-diffusion-v1-4"
    # NOTE(review): `path` was computed but never used in the original --
    # the pipeline always loads the hub id below. Kept for behavior parity;
    # consider passing `path` to from_pretrained if offline loading is wanted.
    pipe = StableDiffusionPipeline.from_pretrained(
        "j-min/reco_sd14_coco", torch_dtype=torch.float16
    )
    pipe = pipe.to("cuda")

    prompt = "a dog and a cat;"
    grounding_instruction = "cut dog; big cat;"
    bbox = [(136, 252, 280, 455), (284, 205, 480, 500)]
    inference_image(pipe, prompt, grounding_instruction, bbox)