fantos committed
Commit f2a614f · verified · 1 parent: b4134a4

Update app.py

Files changed (1):
app.py  +49 −244
app.py CHANGED
@@ -5,7 +5,7 @@ import cv2
 import gradio as gr
 import numpy as np
 from huggingface_hub import snapshot_download
-from transformers import CLIPVisionModelWithProjection, CLIPImageProcessor, pipeline
+from transformers import pipeline
 from diffusers.utils import load_image
 from kolors.pipelines.pipeline_controlnet_xl_kolors_img2img import StableDiffusionXLControlNetImg2ImgPipeline
 from kolors.models.modeling_chatglm import ChatGLMModel
@@ -15,16 +15,11 @@ from diffusers import AutoencoderKL
 from kolors.models.unet_2d_condition import UNet2DConditionModel
 from diffusers import EulerDiscreteScheduler
 from PIL import Image, ImageDraw, ImageFont
-from annotator.midas import MidasDetector
-from annotator.dwpose import DWposeDetector
-from annotator.util import resize_image, HWC3
 import os
 
 device = "cuda"
 ckpt_dir = snapshot_download(repo_id="Kwai-Kolors/Kolors")
-ckpt_dir_depth = snapshot_download(repo_id="Kwai-Kolors/Kolors-ControlNet-Depth")
 ckpt_dir_canny = snapshot_download(repo_id="Kwai-Kolors/Kolors-ControlNet-Canny")
-ckpt_dir_pose = snapshot_download(repo_id="Kwai-Kolors/Kolors-ControlNet-Pose")
 
 # Add translation pipeline
 translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ko-en")
@@ -34,19 +29,7 @@ tokenizer = ChatGLMTokenizer.from_pretrained(f'{ckpt_dir}/text_encoder')
 vae = AutoencoderKL.from_pretrained(f"{ckpt_dir}/vae", revision=None).half().to(device)
 scheduler = EulerDiscreteScheduler.from_pretrained(f"{ckpt_dir}/scheduler")
 unet = UNet2DConditionModel.from_pretrained(f"{ckpt_dir}/unet", revision=None).half().to(device)
-controlnet_depth = ControlNetModel.from_pretrained(f"{ckpt_dir_depth}", revision=None).half().to(device)
 controlnet_canny = ControlNetModel.from_pretrained(f"{ckpt_dir_canny}", revision=None).half().to(device)
-controlnet_pose = ControlNetModel.from_pretrained(f"{ckpt_dir_pose}", revision=None).half().to(device)
-
-pipe_depth = StableDiffusionXLControlNetImg2ImgPipeline(
-    vae=vae,
-    controlnet=controlnet_depth,
-    text_encoder=text_encoder,
-    tokenizer=tokenizer,
-    unet=unet,
-    scheduler=scheduler,
-    force_zeros_for_empty_prompt=False
-)
 
 pipe_canny = StableDiffusionXLControlNetImg2ImgPipeline(
     vae=vae,
@@ -58,16 +41,6 @@ pipe_canny = StableDiffusionXLControlNetImg2ImgPipeline(
     force_zeros_for_empty_prompt=False
 )
 
-pipe_pose = StableDiffusionXLControlNetImg2ImgPipeline(
-    vae=vae,
-    controlnet=controlnet_pose,
-    text_encoder=text_encoder,
-    tokenizer=tokenizer,
-    unet=unet,
-    scheduler=scheduler,
-    force_zeros_for_empty_prompt=False
-)
-
 @spaces.GPU
 def translate_korean_to_english(text):
     if any(ord(char) >= 0xAC00 and ord(char) <= 0xD7A3 for char in text):  # Check if Korean characters are present
@@ -84,150 +57,10 @@ def process_canny_condition(image, canny_threods=[100,200]):
     np_image = HWC3(np_image)
     return Image.fromarray(np_image)
 
-model_midas = MidasDetector()
-@spaces.GPU
-def process_depth_condition_midas(img, res = 1024):
-    h,w,_ = img.shape
-    img = resize_image(HWC3(img), res)
-    result = HWC3(model_midas(img))
-    result = cv2.resize(result, (w,h))
-    return Image.fromarray(result)
-
-model_dwpose = DWposeDetector()
-@spaces.GPU
-def process_dwpose_condition(image, res=1024):
-    h,w,_ = image.shape
-    img = resize_image(HWC3(image), res)
-    out_res, out_img = model_dwpose(image)
-    result = HWC3(out_img)
-    result = cv2.resize(result, (w,h))
-    return Image.fromarray(result)
-
 MAX_SEED = np.iinfo(np.int32).max
 MAX_IMAGE_SIZE = 1024
 
-@spaces.GPU
-def infer_depth(prompt,
-        image = None,
-        negative_prompt = "nsfw, facial shadows, low resolution, jpeg artifacts, blurry, bad quality, dark face, neon lights",
-        seed = 397886929,
-        randomize_seed = False,
-        guidance_scale = 6.0,
-        num_inference_steps = 50,
-        controlnet_conditioning_scale = 0.7,
-        control_guidance_end = 0.9,
-        strength = 1.0
-        ):
-    prompt = translate_korean_to_english(prompt)
-    negative_prompt = translate_korean_to_english(negative_prompt)
-
-    if randomize_seed:
-        seed = random.randint(0, MAX_SEED)
-    generator = torch.Generator().manual_seed(seed)
-    init_image = resize_image(image, MAX_IMAGE_SIZE)
-    pipe = pipe_depth.to("cuda")
-    condi_img = process_depth_condition_midas(np.array(init_image), MAX_IMAGE_SIZE)
-    image = pipe(
-        prompt=prompt,
-        image=init_image,
-        controlnet_conditioning_scale=controlnet_conditioning_scale,
-        control_guidance_end=control_guidance_end,
-        strength=strength,
-        control_image=condi_img,
-        negative_prompt=negative_prompt,
-        num_inference_steps=num_inference_steps,
-        guidance_scale=guidance_scale,
-        num_images_per_prompt=1,
-        generator=generator,
-    ).images[0]
-    return [condi_img, image], seed
-
-@spaces.GPU
-def infer_canny(prompt,
-        image = None,
-        negative_prompt = "nsfw, facial shadows, low resolution, jpeg artifacts, blurry, bad quality, dark face, neon lights",
-        seed = 397886929,
-        randomize_seed = False,
-        guidance_scale = 6.0,
-        num_inference_steps = 50,
-        controlnet_conditioning_scale = 0.7,
-        control_guidance_end = 0.9,
-        strength = 1.0
-        ):
-    prompt = translate_korean_to_english(prompt)
-    negative_prompt = translate_korean_to_english(negative_prompt)
-
-    if randomize_seed:
-        seed = random.randint(0, MAX_SEED)
-    generator = torch.Generator().manual_seed(seed)
-    init_image = resize_image(image, MAX_IMAGE_SIZE)
-    pipe = pipe_canny.to("cuda")
-    condi_img = process_canny_condition(np.array(init_image))
-    image = pipe(
-        prompt=prompt,
-        image=init_image,
-        controlnet_conditioning_scale=controlnet_conditioning_scale,
-        control_guidance_end=control_guidance_end,
-        strength=strength,
-        control_image=condi_img,
-        negative_prompt=negative_prompt,
-        num_inference_steps=num_inference_steps,
-        guidance_scale=guidance_scale,
-        num_images_per_prompt=1,
-        generator=generator,
-    ).images[0]
-    return [condi_img, image], seed
-
-@spaces.GPU
-def infer_pose(prompt,
-        image = None,
-        negative_prompt = "nsfw, facial shadows, low resolution, jpeg artifacts, blurry, bad quality, dark face, neon lights",
-        seed = 66,
-        randomize_seed = False,
-        guidance_scale = 6.0,
-        num_inference_steps = 50,
-        controlnet_conditioning_scale = 0.7,
-        control_guidance_end = 0.9,
-        strength = 1.0
-        ):
-    prompt = translate_korean_to_english(prompt)
-    negative_prompt = translate_korean_to_english(negative_prompt)
-
-    if randomize_seed:
-        seed = random.randint(0, MAX_SEED)
-    generator = torch.Generator().manual_seed(seed)
-    init_image = resize_image(image, MAX_IMAGE_SIZE)
-    pipe = pipe_pose.to("cuda")
-    condi_img = process_dwpose_condition(np.array(init_image), MAX_IMAGE_SIZE)
-    image = pipe(
-        prompt=prompt,
-        image=init_image,
-        controlnet_conditioning_scale=controlnet_conditioning_scale,
-        control_guidance_end=control_guidance_end,
-        strength=strength,
-        control_image=condi_img,
-        negative_prompt=negative_prompt,
-        num_inference_steps=num_inference_steps,
-        guidance_scale=guidance_scale,
-        num_images_per_prompt=1,
-        generator=generator,
-    ).images[0]
-    return [condi_img, image], seed
-
-css = """
-footer {
-    visibility: hidden;
-}
-"""
-
-
-def load_description(fp):
-    with open(fp, 'r', encoding='utf-8') as f:
-        content = f.read()
-    return content
-
-# Add the text_to_image function
-def text_to_image(text, size, position):
+def text_to_image(text, size=72, position="middle-center"):
     width, height = 1024, 576
     image = Image.new("RGB", (width, height), "white")
     draw = ImageDraw.Draw(image)
@@ -278,6 +111,51 @@ def text_to_image(text, size, position):
 
     return image
 
+@spaces.GPU
+def infer_canny(prompt,
+        negative_prompt = "nsfw, facial shadows, low resolution, jpeg artifacts, blurry, bad quality, dark face, neon lights",
+        seed = 397886929,
+        randomize_seed = False,
+        guidance_scale = 6.0,
+        num_inference_steps = 50,
+        controlnet_conditioning_scale = 0.7,
+        control_guidance_end = 0.9,
+        strength = 1.0
+        ):
+    prompt = translate_korean_to_english(prompt)
+    negative_prompt = translate_korean_to_english(negative_prompt)
+
+    if randomize_seed:
+        seed = random.randint(0, MAX_SEED)
+    generator = torch.Generator().manual_seed(seed)
+
+    # Generate text image
+    init_image = text_to_image(prompt)
+    init_image = resize_image(init_image, MAX_IMAGE_SIZE)
+
+    pipe = pipe_canny.to("cuda")
+    condi_img = process_canny_condition(np.array(init_image))
+    image = pipe(
+        prompt=prompt,
+        image=init_image,
+        controlnet_conditioning_scale=controlnet_conditioning_scale,
+        control_guidance_end=control_guidance_end,
+        strength=strength,
+        control_image=condi_img,
+        negative_prompt=negative_prompt,
+        num_inference_steps=num_inference_steps,
+        guidance_scale=guidance_scale,
+        num_images_per_prompt=1,
+        generator=generator,
+    ).images[0]
+    return [condi_img, image], seed
+
+css = """
+footer {
+    visibility: hidden;
+}
+"""
+
 with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as Kolors:
     with gr.Row():
         with gr.Column(elem_id="col-left"):
@@ -287,21 +165,6 @@ with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as Kolors:
                     placeholder="Enter your prompt",
                     lines=2
                 )
-            with gr.Row():
-                image_input_type = gr.Radio(["Upload Image", "Generate Text Image"], label="Input Type", value="Upload Image")
-
-            with gr.Row():
-                image = gr.Image(label="Image", type="pil", visible=True)
-                with gr.Column(visible=False) as text_image_inputs:
-                    text_input = gr.Textbox(label="Enter Text", lines=5, placeholder="Type your text here...")
-                    font_size = gr.Radio([48, 72, 96, 144], label="Font Size", value=72)
-                    text_position = gr.Dropdown(
-                        ["top-left", "top-center", "top-right", "middle-left", "middle-center", "middle-right", "bottom-left", "bottom-center", "bottom-right"],
-                        label="Text Position",
-                        value="middle-center"
-                    )
-                    generate_text_image = gr.Button("Generate Text Image")
-
             with gr.Accordion("Advanced Settings", open=False):
                 negative_prompt = gr.Textbox(
                     label="Negative prompt",
@@ -357,73 +220,15 @@ with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as Kolors:
                 )
             with gr.Row():
                 canny_button = gr.Button("Canny", elem_id="button")
-                depth_button = gr.Button("Depth", elem_id="button")
-                pose_button = gr.Button("Pose", elem_id="button")
 
         with gr.Column(elem_id="col-right"):
            result = gr.Gallery(label="Result", show_label=False, columns=2)
            seed_used = gr.Number(label="Seed Used")
 
-    def toggle_image_input(choice):
-        return {
-            image: gr.update(visible=choice == "Upload Image"),
-            text_image_inputs: gr.update(visible=choice == "Generate Text Image")
-        }
-
-    image_input_type.change(toggle_image_input, image_input_type, [image, text_image_inputs])
-
-    def generate_and_use_text_image(text, size, position):
-        text_image = text_to_image(text, size, position)
-        return text_image
-
-    generate_text_image.click(
-        generate_and_use_text_image,
-        inputs=[text_input, font_size, text_position],
-        outputs=image
-    )
-
-    with gr.Row():
-        gr.Examples(
-            fn = infer_canny,
-            examples = canny_examples,
-            inputs = [prompt, image],
-            outputs = [result, seed_used],
-            label = "Canny"
-        )
-    with gr.Row():
-        gr.Examples(
-            fn = infer_depth,
-            examples = depth_examples,
-            inputs = [prompt, image],
-            outputs = [result, seed_used],
-            label = "Depth"
-        )
-
-    with gr.Row():
-        gr.Examples(
-            fn = infer_pose,
-            examples = pose_examples,
-            inputs = [prompt, image],
-            outputs = [result, seed_used],
-            label = "Pose"
-        )
-
     canny_button.click(
        fn = infer_canny,
-       inputs = [prompt, image, negative_prompt, seed, randomize_seed, guidance_scale, num_inference_steps, controlnet_conditioning_scale, control_guidance_end, strength],
-       outputs = [result, seed_used]
-    )
-
-    depth_button.click(
-       fn = infer_depth,
-       inputs = [prompt, image, negative_prompt, seed, randomize_seed, guidance_scale, num_inference_steps, controlnet_conditioning_scale, control_guidance_end, strength],
-       outputs = [result, seed_used]
-    )
-
-    pose_button.click(
-       fn = infer_pose,
-       inputs = [prompt, image, negative_prompt, seed, randomize_seed, guidance_scale, num_inference_steps, controlnet_conditioning_scale, control_guidance_end, strength],
+       inputs = [prompt, negative_prompt, seed, randomize_seed, guidance_scale, num_inference_steps, controlnet_conditioning_scale, control_guidance_end, strength],
        outputs = [result, seed_used]
    )
 
-Kolors.queue().launch(debug=True)
+Kolors.queue().launch(debug=True)
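
For reference, below is a minimal, self-contained sketch of the preprocessing path the updated app relies on: the prompt text is rendered onto a white canvas, then turned into a Canny edge map that serves as the ControlNet conditioning image. The helper names loosely mirror text_to_image() and process_canny_condition() in app.py, but the bodies are illustrative re-implementations, not the app's code; since this commit drops the annotator.util imports, the HWC3 and resize steps are approximated inline, and the font handling and layout here are assumptions.

# Illustrative sketch only; assumes Pillow, NumPy and OpenCV are installed.
import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont

def make_text_image(text, width=1024, height=576):
    # White canvas with the text drawn near the centre (app.py supports several
    # positions and font sizes; this sketch only does centred default-font text).
    image = Image.new("RGB", (width, height), "white")
    draw = ImageDraw.Draw(image)
    font = ImageFont.load_default()
    left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
    x = (width - (right - left)) // 2
    y = (height - (bottom - top)) // 2
    draw.text((x, y), text, fill="black", font=font)
    return image

def canny_condition(image, low=100, high=200):
    # Edge map with the same default thresholds as process_canny_condition();
    # the single-channel result is replicated to three channels (the HWC3 step).
    gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
    edges = cv2.Canny(gray, low, high)
    return Image.fromarray(np.stack([edges] * 3, axis=-1))

if __name__ == "__main__":
    condition = canny_condition(make_text_image("Hello Kolors"))
    condition.save("canny_condition.png")  # in the Space, this image is passed as control_image

In the Space itself, infer_canny builds this conditioning image from the translated prompt and then passes it as control_image to the Kolors ControlNet img2img pipeline (pipe_canny).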