File size: 1,952 Bytes
2c0adca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# Use a reference image to add an object to the generated image
import torch
from diffusers import StableDiffusionGLIGENTextImagePipeline
from diffusers.utils import load_image

def normalize_bbox(bboxes, img_width, img_height):
    """Scale boxes by the image size.

    Args:
        bboxes: Iterable of 4-item boxes, each unpacked as
            ``(x_min, y_min, x_max, y_max)``.
        img_width: Divisor applied to both x coordinates.
        img_height: Divisor applied to both y coordinates.

    Returns:
        A new list with one ``[x_min, y_min, x_max, y_max]`` list per input
        box, where every coordinate has been divided by the matching
        dimension. The input is not modified.
    """
    return [
        [left / img_width, top / img_height, right / img_width, bottom / img_height]
        for left, top, right, bottom in bboxes
    ]

def inference_image(pipe, prompt, grounding_instruction, state, *, img_width=600, img_height=600):
    """Generate one image with the pipeline, grounding phrases to boxes.

    Args:
        pipe: Callable pipeline accepting the ``prompt`` and ``gligen_*``
            keyword arguments used below and returning an object with an
            ``images`` sequence (e.g. a loaded
            ``StableDiffusionGLIGENTextImagePipeline``).
        prompt: Text prompt describing the whole image.
        grounding_instruction: Semicolon-separated phrases, e.g.
            ``"dog; cat"``. Blank entries are dropped.
        state: Mapping whose ``'boxes'`` entry holds one
            ``[x_min, y_min, x_max, y_max]`` box per phrase, in the pixel
            coordinates of the canvas they were drawn on.
        img_width: Width of that canvas; the default matches the previously
            hard-coded value of 600.
        img_height: Height of that canvas; the default matches the previously
            hard-coded value of 600.

    Returns:
        The first image of the pipeline output (requested as ``"pil"``).
    """
    print(prompt)
    print(grounding_instruction)

    pixel_boxes = state['boxes']
    print(pixel_boxes)
    bbox = normalize_bbox(pixel_boxes, img_width, img_height)
    print(bbox)

    # The pipeline expects a list of phrases (one per box). Previously the raw
    # semicolon-joined string was forwarded and this parsed list went unused.
    objects = [obj.strip() for obj in grounding_instruction.split(';') if obj.strip()]
    print(objects)

    image = pipe(
        prompt=prompt,
        gligen_phrases=objects,
        gligen_images=[],
        gligen_boxes=bbox,
        gligen_scheduled_sampling_beta=1,
        output_type="pil",
        num_inference_steps=50,
    ).images[0]
    return image



if __name__ == "__main__":
    # Demo: generate a single image with one text-grounded box and save it.

    # Load the weights in half precision, then move the pipeline to the GPU.
    pipe = StableDiffusionGLIGENTextImagePipeline.from_pretrained(
        "anhnct/Gligen_Text_Image",
        torch_dtype=torch.float16,
    ).to("cuda")

    prompt = "a flower sitting on the beach"
    phrases = ["flower"]
    # One [x_min, y_min, x_max, y_max] box per phrase; the values here are
    # presumably fractions of the image size (they all lie in [0, 1]).
    boxes = [[0.0, 0.09, 0.53, 0.76]]

    # Optional reference image, currently disabled (no image is passed below):
    # gligen_image = load_image(
    #     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/gligen/pexels-pixabay-60597.jpg"
    # )

    generation_kwargs = {
        "prompt": prompt,
        "gligen_phrases": phrases,
        "gligen_images": [],
        "gligen_boxes": boxes,
        "gligen_scheduled_sampling_beta": 1,
        "output_type": "pil",
        "num_inference_steps": 50,
    }
    result = pipe(**generation_kwargs)

    first_image = result.images[0]
    first_image.save("./gligen-generation-text-image-box.jpg")