Update app.py
app.py CHANGED
@@ -2,7 +2,7 @@ import spaces
 import gradio as gr
 import torch
 from diffusers import AutoPipelineForInpainting
-from PIL import Image
+from PIL import Image, ImageFilter
 from transformers import (
     AutoModelForCausalLM,
     AutoTokenizer,
@@ -21,7 +21,7 @@ def delete_model(model):
     torch.cuda.empty_cache()
 
 @spaces.GPU()
-def run_language_model(edit_prompt, device):
+def run_language_model(edit_prompt, caption, device):
     language_model_id = "Qwen/Qwen1.5-0.5B-Chat"
     language_model = AutoModelForCausalLM.from_pretrained(
         language_model_id, device_map="auto"
@@ -29,19 +29,27 @@ def run_language_model(edit_prompt, device):
     tokenizer = AutoTokenizer.from_pretrained(language_model_id)
     messages = [
         {"role": "system", "content": "Follow the examples and return the expected output"},
-        {"role": "user", "content": "
-        {"role": "assistant", "content": "
-        {"role": "user", "content": "
-        {"role": "assistant", "content": "dog,
-        {"role": "user", "content": "
-        {"role": "assistant", "content": "
-        {"role": "user", "content": "
-        {"role": "assistant", "content": "
-        {"role": "user", "content": "
-        {"role": "assistant", "content": "
-        {"role": "user", "content": "
-        {"role": "assistant", "content": "
-        {"role": "user", "content":
+        {"role": "user", "content": "Caption: a blue sky with fluffy clouds\nQuery: Make the sky stormy"},
+        {"role": "assistant", "content": "A: sky\nB: a stormy sky with heavy gray clouds, torrential rain, gloomy, overcast"},
+        {"role": "user", "content": "Caption: a cat sleeping on a sofa\nQuery: Change the cat to a dog"},
+        {"role": "assistant", "content": "A: cat\nB: a dog sleeping on a sofa, cozy and comfortable, snuggled up in a warm blanket, peaceful"},
+        {"role": "user", "content": "Caption: a snowy mountain peak\nQuery: Replace the snow with greenery"},
+        {"role": "assistant", "content": "A: snow\nB: a lush green mountain peak in summer, clear blue skies, birds flying overhead, serene and majestic"},
+        {"role": "user", "content": "Caption: a vintage car parked by the roadside\nQuery: Change the car to a modern electric vehicle"},
+        {"role": "assistant", "content": "A: car\nB: a sleek modern electric vehicle parked by the roadside, cutting-edge design, environmentally friendly, silent and powerful"},
+        {"role": "user", "content": "Caption: a wooden bridge over a river\nQuery: Make the bridge stone"},
+        {"role": "assistant", "content": "A: bridge\nB: an ancient stone bridge over a river, moss-covered, sturdy and timeless, with clear waters flowing beneath"},
+        {"role": "user", "content": "Caption: a bowl of salad on the table\nQuery: Replace salad with soup"},
+        {"role": "assistant", "content": "A: bowl\nB: a bowl of steaming hot soup on the table, scrumptious, with garnishing"},
+        {"role": "user", "content": "Caption: a book on a desk surrounded by stationery\nQuery: Remove all stationery, add a laptop"},
+        {"role": "assistant", "content": "A: stationery\nB: a book on a desk with a laptop next to it, modern study setup, focused and productive, technology and education combined"},
+        {"role": "user", "content": "Caption: a cup of coffee on a wooden table\nQuery: Change coffee to tea"},
+        {"role": "assistant", "content": "A: cup\nB: a steaming cup of tea on a wooden table, calming and aromatic, with a slice of lemon on the side, inviting"},
+        {"role": "user", "content": "Caption: a small pen on a white table\nQuery: Change the pen to an elaborate fountain pen"},
+        {"role": "assistant", "content": "A: pen\nB: an elaborate fountain pen on a white table, sleek and elegant, with intricate designs, ready for writing"},
+        {"role": "user", "content": "Caption: a plain notebook on a desk\nQuery: Replace the notebook with a journal"},
+        {"role": "assistant", "content": "A: notebook\nB: an artistically decorated journal on a desk, vibrant cover, filled with creativity, inspiring and personalized"},
+        {"role": "user", "content": f"Caption: {caption}\nQuery: {edit_prompt}"},
     ]
     text = tokenizer.apply_chat_template(
         messages,
@@ -61,10 +69,13 @@ def run_language_model(edit_prompt, device):
         output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
     ]
     response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
-
+
+    output_generation_a, output_generation_b = response.split("\n")
+    to_replace = output_generation_a[2:].strip()
+    replace_caption = output_generation_b[2:].strip()
 
     delete_model(language_model)
-    return (to_replace, replace_with)
+    return (to_replace, replace_caption)
 
 @spaces.GPU()
 def run_image_captioner(image, device):
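The parsing added above assumes the chat model reproduces the exact two-line shape shown in the few-shot examples; a reply that deviates from it would make the unpacking raise ValueError. A minimal sketch with a hypothetical, hand-written response (not actual model output):

# Hypothetical reply in the "A: <object>\nB: <new caption>" format.
response = "A: cat\nB: a dog sleeping on a sofa, cozy and comfortable"

output_generation_a, output_generation_b = response.split("\n")
to_replace = output_generation_a[2:].strip()       # "A: cat" -> "cat"
replace_caption = output_generation_b[2:].strip()  # "B: a dog ..." -> "a dog ..."

print(to_replace)       # cat
print(replace_caption)  # a dog sleeping on a sofa, cozy and comfortable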
@@ -120,13 +131,16 @@ def run_segmentation(image, object_to_segment, device):
     return masks
 
 @spaces.GPU()
-def run_inpainting(image, replaced_caption, masks, device):
+def run_inpainting(image, replaced_caption, masks, generator, device):
     pipeline = AutoPipelineForInpainting.from_pretrained(
         "diffusers/stable-diffusion-xl-1.0-inpainting-0.1",
         torch_dtype=torch.float16,
         variant="fp16",
     ).to(device)
 
+    masks = Image.fromarray(masks.numpy())
+    dilation_image = masks.filter(ImageFilter.MaxFilter(3))
+
     prompt = replaced_caption
     negative_prompt = """lowres, bad anatomy, bad hands,
     text, error, missing fingers, extra digit, fewer digits,
@@ -135,10 +149,11 @@ def run_inpainting(image, replaced_caption, masks, device):
     output = pipeline(
         prompt=prompt,
         image=image,
-        mask_image=masks,
+        mask_image=dilation_image,
         negative_prompt=negative_prompt,
         guidance_scale=7.5,
         strength=1.0,
+        generator=generator,
     ).images[0]
 
     delete_model(pipeline)
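ImageFilter.MaxFilter(3) acts as a morphological dilation: each pixel becomes the maximum of its 3x3 neighborhood, so the white (masked) region grows by one pixel on every side, giving the inpainting model some slack around the segmentation boundary. A standalone sketch, assuming a single-channel uint8 mask (the diff first converts a torch tensor via masks.numpy()):

import numpy as np
from PIL import Image, ImageFilter

# Hypothetical binary mask: a 16x16 white square on a 64x64 black canvas.
mask = np.zeros((64, 64), dtype=np.uint8)
mask[24:40, 24:40] = 255

mask_image = Image.fromarray(mask)
# Each output pixel is the max of its 3x3 neighborhood, dilating the mask.
dilated = mask_image.filter(ImageFilter.MaxFilter(3))

print(np.asarray(mask_image).sum())  # 65280 (16*16 white pixels)
print(np.asarray(dilated).sum())     # 82620 (18*18: one-pixel border added)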
@@ -151,24 +166,22 @@ def run_open_gen_fill(image, edit_prompt):
     # Resize the image to (512, 512)
     image = image.resize((512, 512))
 
-    # Run the langauge model to extract the objects to be swapped from
-    # the edit prompt
-    to_replace, replace_with = run_language_model(
-        edit_prompt=edit_prompt, device=device
-    )
-
     # Caption the input image
     caption = run_image_captioner(image, device=device)
 
-    #
-
+    # Run the language model to extract the object for segmentation
+    # and get the replaced caption
+    to_replace, replace_caption = run_language_model(
+        edit_prompt=edit_prompt, caption=caption, device=device
+    )
 
     # Segment the `to_replace` object from the input image
     masks = run_segmentation(image, to_replace, device=device)
 
     # Diffusion pipeline for inpainting
+    generator = torch.Generator(device).manual_seed(17)
     output = run_inpainting(
-        image=image, replaced_caption=replaced_caption, masks=masks, device=device
+        image=image, replaced_caption=replace_caption, masks=masks, generator=generator, device=device
     )
 
     return (
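The seeded torch.Generator threaded through run_inpainting makes the diffusion output reproducible for a given image and prompt. A quick standalone illustration of the mechanism:

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

# The same seed yields the same "random" draws, hence the same latents
# and the same inpainting result for identical inputs.
a = torch.randn(4, generator=torch.Generator(device).manual_seed(17), device=device)
b = torch.randn(4, generator=torch.Generator(device).manual_seed(17), device=device)
print(torch.equal(a, b))  # True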