
gaur3009 committed
Commit 713a6b3 · verified · 1 Parent(s): cce2f96

Update app.py

Files changed (1):
  1. app.py +139 -123
app.py CHANGED
@@ -1,9 +1,7 @@
- import warnings
- warnings.filterwarnings("ignore", category=FutureWarning)
-
- import torch
- import gradio as gr
  from PIL import Image
  from src.tryon_pipeline import StableDiffusionXLInpaintPipeline as TryonPipeline
  from src.unet_hacked_garmnet import UNet2DConditionModel as UNet2DConditionModel_ref
  from src.unet_hacked_tryon import UNet2DConditionModel
@@ -13,44 +11,46 @@ from transformers import (
      CLIPTextModel,
      CLIPTextModelWithProjection,
  )
- from diffusers import DDPMScheduler, AutoencoderKL
  from typing import List

- import numpy as np
  import os
  from utils_mask import get_mask_location
  from torchvision import transforms
  import apply_net
  from preprocess.humanparsing.run_parsing import Parsing
  from preprocess.openpose.run_openpose import OpenPose
- from detectron2.data.detection_utils import convert_PIL_to_numpy, _apply_exif_orientation
  from torchvision.transforms.functional import to_pil_image

- # Function to convert PIL image to binary mask
  def pil_to_binary_mask(pil_image, threshold=0):
      np_image = np.array(pil_image)
      grayscale_image = Image.fromarray(np_image).convert("L")
      binary_mask = np.array(grayscale_image) > threshold
      mask = np.zeros(binary_mask.shape, dtype=np.uint8)
-     for i in range(binary_mask.shape):
-         for j in range(binary_mask.shape):
-             if binary_mask[i, j] == True:
-                 mask[i, j] = 1
-     mask = (mask * 255).astype(np.uint8)
      output_mask = Image.fromarray(mask)
      return output_mask

  base_path = 'yisol/IDM-VTON'
  example_path = os.path.join(os.path.dirname(__file__), 'example')

- # Load models with lower precision (float16) to reduce memory usage
  unet = UNet2DConditionModel.from_pretrained(
      base_path,
      subfolder="unet",
      torch_dtype=torch.float16,
  )
  unet.requires_grad_(False)
-
  tokenizer_one = AutoTokenizer.from_pretrained(
      base_path,
      subfolder="tokenizer",
@@ -63,7 +63,6 @@ tokenizer_two = AutoTokenizer.from_pretrained(
      revision=None,
      use_fast=False,
  )
-
  noise_scheduler = DDPMScheduler.from_pretrained(base_path, subfolder="scheduler")

  text_encoder_one = CLIPTextModel.from_pretrained(
@@ -80,10 +79,13 @@ image_encoder = CLIPVisionModelWithProjection.from_pretrained(
      base_path,
      subfolder="image_encoder",
      torch_dtype=torch.float16,
  )

- vae = AutoencoderKL.from_pretrained(base_path, subfolder="vae", torch_dtype=torch.float16)
-
  UNet_Encoder = UNet2DConditionModel_ref.from_pretrained(
      base_path,
      subfolder="unet_encoder",
@@ -99,40 +101,37 @@ vae.requires_grad_(False)
  unet.requires_grad_(False)
  text_encoder_one.requires_grad_(False)
  text_encoder_two.requires_grad_(False)
-
- tensor_transform = transforms.Compose(
-     [
-         transforms.ToTensor(),
-         transforms.Normalize([0.5], [0.5]),
-     ]
- )

  pipe = TryonPipeline.from_pretrained(
-     base_path,
-     unet=unet,
-     vae=vae,
-     feature_extractor=CLIPImageProcessor(),
-     text_encoder=text_encoder_one,
-     text_encoder_2=text_encoder_two,
-     tokenizer=tokenizer_one,
-     tokenizer_2=tokenizer_two,
-     scheduler=noise_scheduler,
-     image_encoder=image_encoder,
-     torch_dtype=torch.float16,
  )
  pipe.unet_encoder = UNet_Encoder

- @grSpaces.GPU
- def start_tryon(dict, garm_img, garment_des, is_checked, is_checked_crop, denoise_steps, seed):
-     device = "cuda"

      openpose_model.preprocessor.body_estimation.model.to(device)
      pipe.to(device)
      pipe.unet_encoder.to(device)

-     garm_img = garm_img.convert("RGB").resize((768, 1024))
-     human_img_orig = dict["background"].convert("RGB")
-
      if is_checked_crop:
          width, height = human_img_orig.size
          target_width = int(min(width, height * (3 / 4)))
@@ -143,112 +142,121 @@ def start_tryon(dict, garm_img, garment_des, is_checked, is_checked_crop, denois
          bottom = (height + target_height) / 2
          cropped_img = human_img_orig.crop((left, top, right, bottom))
          crop_size = cropped_img.size
-         human_img = cropped_img.resize((768, 1024))
      else:
-         human_img = human_img_orig.resize((768, 1024))

      if is_checked:
-         keypoints = openpose_model(human_img.resize((384, 512)))
-         model_parse, _ = parsing_model(human_img.resize((384, 512)))
          mask, mask_gray = get_mask_location('hd', "upper_body", model_parse, keypoints)
-         mask = mask.resize((768, 1024))
      else:
-         mask = pil_to_binary_mask(dict['layers'].convert("RGB").resize((768, 1024)))

-     mask_gray = (1 - transforms.ToTensor()(mask)) * tensor_transform(human_img)
-     mask_gray = to_pil_image((mask_gray + 1.0) / 2.0)

-     human_img_arg = _apply_exif_orientation(human_img.resize((384, 512)))
      human_img_arg = convert_PIL_to_numpy(human_img_arg, format="BGR")

      args = apply_net.create_argument_parser().parse_args(('show', './configs/densepose_rcnn_R_50_FPN_s1x.yaml', './ckpt/densepose/model_final_162be9.pkl', 'dp_segm', '-v', '--opts', 'MODEL.DEVICE', 'cuda'))
-     pose_img = args.func(args, human_img_arg)
-     pose_img = pose_img[:, :, ::-1]
-     pose_img = Image.fromarray(pose_img).resize((768, 1024))
-
-     with torch.cuda.amp.autocast():
-         with torch.no_grad():
-             prompt = "model is wearing " + garment_des
-             negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
-             with torch.inference_mode():
-                 (
-                     prompt_embeds,
-                     negative_prompt_embeds,
-                     pooled_prompt_embeds,
-                     negative_pooled_prompt_embeds,
-                 ) = pipe.encode_prompt(
-                     prompt,
-                     num_images_per_prompt=1,
-                     do_classifier_free_guidance=True,
-                     negative_prompt=negative_prompt,
-                 )
-
-                 prompt = "a photo of " + garment_des
                  negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
-                 if not isinstance(prompt, List):
-                     prompt = [prompt] * 1
-                 if not isinstance(negative_prompt, List):
-                     negative_prompt = [negative_prompt] * 1
                  with torch.inference_mode():
                      (
-                         prompt_embeds_c,
-                         _,
-                         _,
-                         _,
                      ) = pipe.encode_prompt(
                          prompt,
                          num_images_per_prompt=1,
-                         do_classifier_free_guidance=False,
                          negative_prompt=negative_prompt,
                      )
-
-                 pose_img = tensor_transform(pose_img).unsqueeze(0).to(device, torch.float16)
-                 garm_tensor = tensor_transform(garm_img).unsqueeze(0).to(device, torch.float16)
-                 generator = torch.Generator(device).manual_seed(seed) if seed is not None else None
-                 images = pipe(
-                     prompt_embeds=prompt_embeds.to(device, torch.float16),
-                     negative_prompt_embeds=negative_prompt_embeds.to(device, torch.float16),
-                     pooled_prompt_embeds=pooled_prompt_embeds.to(device, torch.float16),
-                     negative_pooled_prompt_embeds=negative_pooled_prompt_embeds.to(device, torch.float16),
-                     num_inference_steps=denoise_steps,
-                     generator=generator,
-                     strength=1.0,
-                     pose_img=pose_img.to(device, torch.float16),
-                     text_embeds_cloth=prompt_embeds_c.to(device, torch.float16),
-                     cloth=garm_tensor.to(device, torch.float16),
-                     mask_image=mask,
-                     image=human_img,
-                     height=1024,
-                     width=768,
-                     ip_adapter_image=garm_img.resize((768, 1024)),
-                     guidance_scale=2.0,
-                 )
-
-                 # Clear GPU memory after inference
-                 torch.cuda.empty_cache()

      if is_checked_crop:
-         out_img = images.resize(crop_size)
-         human_img_orig.paste(out_img, (int(left), int(top)))
          return human_img_orig, mask_gray
      else:
-         return images, mask_gray

- garm_list = os.listdir(os.path.join(example_path, "cloth"))
- garm_list_path = [os.path.join(example_path, "cloth", garm) for garm in garm_list]

- human_list = os.listdir(os.path.join(example_path, "human"))
- human_list_path = [os.path.join(example_path, "human", human) for human in human_list]

  human_ex_list = []
  for ex_human in human_list_path:
-     ex_dict = {}
      ex_dict['background'] = ex_human
      ex_dict['layers'] = None
      ex_dict['composite'] = None
      human_ex_list.append(ex_dict)

- # Default human

  image_blocks = gr.Blocks().queue()
  with image_blocks as demo:
@@ -258,9 +266,9 @@ with image_blocks as demo:
          with gr.Column():
              imgs = gr.ImageEditor(sources='upload', type="pil", label='Human. Mask with pen or use auto-masking', interactive=True)
              with gr.Row():
-                 is_checked = gr.Checkbox(label="Yes", info="Use auto-generated mask (Takes 5 seconds)", value=True)
              with gr.Row():
-                 is_checked_crop = gr.Checkbox(label="Yes", info="Use auto-crop & resizing", value=False)

              example = gr.Examples(
                  inputs=imgs,
@@ -279,10 +287,13 @@ with image_blocks as demo:
                  examples=garm_list_path)
          with gr.Column():
              # image_out = gr.Image(label="Output", elem_id="output-img", height=400)
-             masked_img = gr.Image(label="Masked image output", elem_id="masked-img", show_share_button=False)
          with gr.Column():
              # image_out = gr.Image(label="Output", elem_id="output-img", height=400)
-             image_out = gr.Image(label="Output", elem_id="output-img", show_share_button=False)

          with gr.Column():
              try_button = gr.Button(value="Try-on")
@@ -291,6 +302,11 @@ with image_blocks as demo:
              denoise_steps = gr.Number(label="Denoising Steps", minimum=20, maximum=40, value=30, step=1)
              seed = gr.Number(label="Seed", minimum=-1, maximum=2147483647, step=1, value=42)

-     try_button.click(fn=start_tryon, inputs=[imgs, garm_img, prompt, is_checked, is_checked_crop, denoise_steps, seed], outputs=[image_out, masked_img], api_name='tryon')

- image_blocks.launch()

+ import sys
+ sys.path.append('./')
  from PIL import Image
+ import gradio as gr
  from src.tryon_pipeline import StableDiffusionXLInpaintPipeline as TryonPipeline
  from src.unet_hacked_garmnet import UNet2DConditionModel as UNet2DConditionModel_ref
  from src.unet_hacked_tryon import UNet2DConditionModel

      CLIPTextModel,
      CLIPTextModelWithProjection,
  )
+ from diffusers import DDPMScheduler,AutoencoderKL
  from typing import List

+ import torch
  import os
+ from transformers import AutoTokenizer
+ import numpy as np
  from utils_mask import get_mask_location
  from torchvision import transforms
  import apply_net
  from preprocess.humanparsing.run_parsing import Parsing
  from preprocess.openpose.run_openpose import OpenPose
+ from detectron2.data.detection_utils import convert_PIL_to_numpy,_apply_exif_orientation
  from torchvision.transforms.functional import to_pil_image

+ device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
+
  def pil_to_binary_mask(pil_image, threshold=0):
      np_image = np.array(pil_image)
      grayscale_image = Image.fromarray(np_image).convert("L")
      binary_mask = np.array(grayscale_image) > threshold
      mask = np.zeros(binary_mask.shape, dtype=np.uint8)
+     for i in range(binary_mask.shape[0]):
+         for j in range(binary_mask.shape[1]):
+             if binary_mask[i,j] == True :
+                 mask[i,j] = 1
+     mask = (mask*255).astype(np.uint8)
      output_mask = Image.fromarray(mask)
      return output_mask

+
  base_path = 'yisol/IDM-VTON'
  example_path = os.path.join(os.path.dirname(__file__), 'example')

  unet = UNet2DConditionModel.from_pretrained(
      base_path,
      subfolder="unet",
      torch_dtype=torch.float16,
  )
  unet.requires_grad_(False)
  tokenizer_one = AutoTokenizer.from_pretrained(
      base_path,
      subfolder="tokenizer",

      revision=None,
      use_fast=False,
  )
  noise_scheduler = DDPMScheduler.from_pretrained(base_path, subfolder="scheduler")

  text_encoder_one = CLIPTextModel.from_pretrained(

      base_path,
      subfolder="image_encoder",
      torch_dtype=torch.float16,
+ )
+ vae = AutoencoderKL.from_pretrained(base_path,
+     subfolder="vae",
+     torch_dtype=torch.float16,
  )

+ # "stabilityai/stable-diffusion-xl-base-1.0",
  UNet_Encoder = UNet2DConditionModel_ref.from_pretrained(
      base_path,
      subfolder="unet_encoder",

  unet.requires_grad_(False)
  text_encoder_one.requires_grad_(False)
  text_encoder_two.requires_grad_(False)
+ tensor_transfrom = transforms.Compose(
+     [
+         transforms.ToTensor(),
+         transforms.Normalize([0.5], [0.5]),
+     ]
+ )

  pipe = TryonPipeline.from_pretrained(
+     base_path,
+     unet=unet,
+     vae=vae,
+     feature_extractor= CLIPImageProcessor(),
+     text_encoder = text_encoder_one,
+     text_encoder_2 = text_encoder_two,
+     tokenizer = tokenizer_one,
+     tokenizer_2 = tokenizer_two,
+     scheduler = noise_scheduler,
+     image_encoder=image_encoder,
+     torch_dtype=torch.float16,
  )
  pipe.unet_encoder = UNet_Encoder

+ def start_tryon(dict,garm_img,garment_des,is_checked,is_checked_crop,denoise_steps,seed):

      openpose_model.preprocessor.body_estimation.model.to(device)
      pipe.to(device)
      pipe.unet_encoder.to(device)

+     garm_img= garm_img.convert("RGB").resize((768,1024))
+     human_img_orig = dict["background"].convert("RGB")
+
      if is_checked_crop:
          width, height = human_img_orig.size
          target_width = int(min(width, height * (3 / 4)))

          bottom = (height + target_height) / 2
          cropped_img = human_img_orig.crop((left, top, right, bottom))
          crop_size = cropped_img.size
+         human_img = cropped_img.resize((768,1024))
      else:
+         human_img = human_img_orig.resize((768,1024))
+

      if is_checked:
+         keypoints = openpose_model(human_img.resize((384,512)))
+         model_parse, _ = parsing_model(human_img.resize((384,512)))
          mask, mask_gray = get_mask_location('hd', "upper_body", model_parse, keypoints)
+         mask = mask.resize((768,1024))
      else:
+         mask = pil_to_binary_mask(dict['layers'][0].convert("RGB").resize((768, 1024)))
+         # mask = transforms.ToTensor()(mask)
+         # mask = mask.unsqueeze(0)
+     mask_gray = (1-transforms.ToTensor()(mask)) * tensor_transfrom(human_img)
+     mask_gray = to_pil_image((mask_gray+1.0)/2.0)


+     human_img_arg = _apply_exif_orientation(human_img.resize((384,512)))
      human_img_arg = convert_PIL_to_numpy(human_img_arg, format="BGR")
+
+

      args = apply_net.create_argument_parser().parse_args(('show', './configs/densepose_rcnn_R_50_FPN_s1x.yaml', './ckpt/densepose/model_final_162be9.pkl', 'dp_segm', '-v', '--opts', 'MODEL.DEVICE', 'cuda'))
+     # verbosity = getattr(args, "verbosity", None)
+     pose_img = args.func(args,human_img_arg)
+     pose_img = pose_img[:,:,::-1]
+     pose_img = Image.fromarray(pose_img).resize((768,1024))
+
+     with torch.no_grad():
+         # Extract the images
+         with torch.cuda.amp.autocast():
+             with torch.no_grad():
+                 prompt = "model is wearing " + garment_des
                  negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
                  with torch.inference_mode():
                      (
+                         prompt_embeds,
+                         negative_prompt_embeds,
+                         pooled_prompt_embeds,
+                         negative_pooled_prompt_embeds,
                      ) = pipe.encode_prompt(
                          prompt,
                          num_images_per_prompt=1,
+                         do_classifier_free_guidance=True,
                          negative_prompt=negative_prompt,
                      )
+
+                 prompt = "a photo of " + garment_des
+                 negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
+                 if not isinstance(prompt, List):
+                     prompt = [prompt] * 1
+                 if not isinstance(negative_prompt, List):
+                     negative_prompt = [negative_prompt] * 1
+                 with torch.inference_mode():
+                     (
+                         prompt_embeds_c,
+                         _,
+                         _,
+                         _,
+                     ) = pipe.encode_prompt(
+                         prompt,
+                         num_images_per_prompt=1,
+                         do_classifier_free_guidance=False,
+                         negative_prompt=negative_prompt,
+                     )
+
+
+
+                 pose_img = tensor_transfrom(pose_img).unsqueeze(0).to(device,torch.float16)
+                 garm_tensor = tensor_transfrom(garm_img).unsqueeze(0).to(device,torch.float16)
+                 generator = torch.Generator(device).manual_seed(seed) if seed is not None else None
+                 images = pipe(
+                     prompt_embeds=prompt_embeds.to(device,torch.float16),
+                     negative_prompt_embeds=negative_prompt_embeds.to(device,torch.float16),
+                     pooled_prompt_embeds=pooled_prompt_embeds.to(device,torch.float16),
+                     negative_pooled_prompt_embeds=negative_pooled_prompt_embeds.to(device,torch.float16),
+                     num_inference_steps=denoise_steps,
+                     generator=generator,
+                     strength = 1.0,
+                     pose_img = pose_img.to(device,torch.float16),
+                     text_embeds_cloth=prompt_embeds_c.to(device,torch.float16),
+                     cloth = garm_tensor.to(device,torch.float16),
+                     mask_image=mask,
+                     image=human_img,
+                     height=1024,
+                     width=768,
+                     ip_adapter_image = garm_img.resize((768,1024)),
+                     guidance_scale=2.0,
+                 )[0]

      if is_checked_crop:
+         out_img = images[0].resize(crop_size)
+         human_img_orig.paste(out_img, (int(left), int(top)))
          return human_img_orig, mask_gray
      else:
+         return images[0], mask_gray
+         # return images[0], mask_gray

+ garm_list = os.listdir(os.path.join(example_path,"cloth"))
+ garm_list_path = [os.path.join(example_path,"cloth",garm) for garm in garm_list]

+ human_list = os.listdir(os.path.join(example_path,"human"))
+ human_list_path = [os.path.join(example_path,"human",human) for human in human_list]

  human_ex_list = []
  for ex_human in human_list_path:
+     ex_dict= {}
      ex_dict['background'] = ex_human
      ex_dict['layers'] = None
      ex_dict['composite'] = None
      human_ex_list.append(ex_dict)

+ ##default human
+

  image_blocks = gr.Blocks().queue()
  with image_blocks as demo:
 
          with gr.Column():
              imgs = gr.ImageEditor(sources='upload', type="pil", label='Human. Mask with pen or use auto-masking', interactive=True)
              with gr.Row():
+                 is_checked = gr.Checkbox(label="Yes", info="Use auto-generated mask (Takes 5 seconds)",value=True)
              with gr.Row():
+                 is_checked_crop = gr.Checkbox(label="Yes", info="Use auto-crop & resizing",value=False)

              example = gr.Examples(
                  inputs=imgs,

                  examples=garm_list_path)
          with gr.Column():
              # image_out = gr.Image(label="Output", elem_id="output-img", height=400)
+             masked_img = gr.Image(label="Masked image output", elem_id="masked-img",show_share_button=False)
          with gr.Column():
              # image_out = gr.Image(label="Output", elem_id="output-img", height=400)
+             image_out = gr.Image(label="Output", elem_id="output-img",show_share_button=False)
+
+
+

          with gr.Column():
              try_button = gr.Button(value="Try-on")

              denoise_steps = gr.Number(label="Denoising Steps", minimum=20, maximum=40, value=30, step=1)
              seed = gr.Number(label="Seed", minimum=-1, maximum=2147483647, step=1, value=42)


+
+     try_button.click(fn=start_tryon, inputs=[imgs, garm_img, prompt, is_checked,is_checked_crop, denoise_steps, seed], outputs=[image_out,masked_img], api_name='tryon')
+
+
+
+
+ image_blocks.launch()
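
Aside from the diff itself: the main functional fix in `pil_to_binary_mask` is indexing `binary_mask.shape[0]` and `binary_mask.shape[1]` instead of passing the shape tuple to `range()`. The same thresholding can also be written without the per-pixel Python loop; the sketch below is a hypothetical vectorized alternative (the function name is illustrative and is not part of this commit), shown only to clarify what the loop computes.

```python
import numpy as np
from PIL import Image

def pil_to_binary_mask_vectorized(pil_image, threshold=0):
    # Convert to grayscale and threshold in one NumPy comparison,
    # mirroring the loop in app.py: pixels above `threshold` become 255.
    grayscale = np.array(pil_image.convert("L"))
    binary_mask = grayscale > threshold          # boolean array
    mask = binary_mask.astype(np.uint8) * 255    # 0 or 255 per pixel
    return Image.fromarray(mask)
```

On the 768×1024 masks used in this app, that replaces roughly 800,000 Python-level loop iterations per call with a single array operation, while producing the same 0/255 mask image.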