EveryText

Building

File size: 14,078 Bytes

d5f497d
 
 
6c91ee7
 
 
d5f497d
0bf993c
6c91ee7
 
d5f497d
 
6c91ee7
0bf993c
6c91ee7
 
 
 
3ad3d31
6c91ee7
 
d5f497d
 
6c91ee7
 
3ad3d31
d5f497d
0bf993c
 
 
d5f497d
 
 
 
6c91ee7
 
 
3ad3d31
d5f497d
6c91ee7
d5f497d
0bf993c
6c91ee7
 
 
 
d5f497d
6c91ee7
d5f497d
6c91ee7
d5f497d
0bf993c
d5f497d
 
6c91ee7
d5f497d
 
6c91ee7
d5f497d
3ad3d31
 
0bf993c
3ad3d31
 
 
 
 
 
 
0bf993c
 
 
 
 
 
 
6c91ee7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d5f497d
3ad3d31
 
 
 
 
3936987
 
0bf993c
3ad3d31
 
d5f497d
8004741
d5f497d
 
e9f3ef9
6c91ee7
0bf993c
8f532a7
6c91ee7
 
 
 
 
 
 
0bf993c
 
 
d5f497d
 
 
0bf993c
e9f3ef9
0bf993c
6c91ee7
0bf993c
 
 
 
 
 
 
 
 
6c91ee7
 
 
9de30d4
cd4f227
e9f3ef9
 
 
0bf993c
6155537
e9f3ef9
 
 
 
 
 
 
0bf993c
 
 
e9f3ef9
 
 
0bf993c
e9f3ef9
 
 
0bf993c
 
 
 
 
 
 
 
 
e9f3ef9
 
 
9de30d4
fad18b4
3ad3d31
 
 
0bf993c
595a73a
3ad3d31
 
 
 
 
 
 
0bf993c
 
 
3ad3d31
 
 
0bf993c
595a73a
3ad3d31
 
0bf993c
 
 
 
 
 
 
 
 
3ad3d31
 
 
 
 
78ad020
0bf993c
fad18b4
0bf993c
fad18b4
78ad020
 
 
0bf993c
fad18b4
0bf993c
fad18b4
d5f497d
 
3ad3d31
0bf993c
0fb30ab
0bf993c
0fb30ab
3ad3d31
 
d5f497d
 
 
d890da3
d5f497d
 
 
 
 
20c2217
83bde13
20c2217
d5f497d
 
f92dc60
 
 
 
 
 
d5f497d
 
 
 
 
 
d890da3
d5f497d
6c91ee7
 
d5f497d
 
 
 
 
0bf993c
d5f497d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6c91ee7
d5f497d
 
 
 
 
 
6c91ee7
d5f497d
 
6c91ee7
 
d5f497d
 
6c91ee7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d5f497d
78ad020
20c2217
 
3ad3d31
d5f497d
 
6c91ee7
9de30d4
d5f497d
 
 
e9f3ef9
78ad020
fad18b4
7132521
78ad020
 
 
 
e9f3ef9
78ad020
fad18b4
7132521
78ad020
d5f497d
3ad3d31
 
 
 
 
 
 
 
 
d5f497d
78ad020
e9f3ef9
20c2217
9de30d4
78ad020
 
 
e9f3ef9
20c2217
9de30d4
78ad020
 
3ad3d31
 
 
 
 
 
0bf993c

import spaces
import random
import torch
import cv2
import gradio as gr
import numpy as np
from huggingface_hub import snapshot_download
from transformers import CLIPVisionModelWithProjection, CLIPImageProcessor, pipeline
from diffusers.utils import load_image
from kolors.pipelines.pipeline_controlnet_xl_kolors_img2img import StableDiffusionXLControlNetImg2ImgPipeline
from kolors.models.modeling_chatglm import ChatGLMModel
from kolors.models.tokenization_chatglm import ChatGLMTokenizer
from kolors.models.controlnet import ControlNetModel
from diffusers import AutoencoderKL
from kolors.models.unet_2d_condition import UNet2DConditionModel
from diffusers import EulerDiscreteScheduler
from PIL import Image
from annotator.midas import MidasDetector
from annotator.dwpose import DWposeDetector
from annotator.util import resize_image, HWC3

# Every model loaded below is moved to this device.
device = "cuda"

# Resolve local snapshot directories for the base Kolors checkpoint and the
# three ControlNet checkpoints (fetched in this order: base, depth, canny, pose).
ckpt_dir, ckpt_dir_depth, ckpt_dir_canny, ckpt_dir_pose = (
    snapshot_download(repo_id=repo_id)
    for repo_id in (
        "Kwai-Kolors/Kolors",
        "Kwai-Kolors/Kolors-ControlNet-Depth",
        "Kwai-Kolors/Kolors-ControlNet-Canny",
        "Kwai-Kolors/Kolors-ControlNet-Pose",
    )
)

# Korean -> English translation pipeline used to pre-process prompts.
translator = pipeline(task="translation", model="Helsinki-NLP/opus-mt-ko-en")


def _load_half(model_cls, path, **load_kwargs):
    """Load ``model_cls`` from ``path``, cast it with ``.half()`` and move it to ``device``."""
    return model_cls.from_pretrained(path, **load_kwargs).half().to(device)


# Components shared by all three ControlNet pipelines.
text_encoder = _load_half(ChatGLMModel, f'{ckpt_dir}/text_encoder', torch_dtype=torch.float16)
tokenizer = ChatGLMTokenizer.from_pretrained(f'{ckpt_dir}/text_encoder')
vae = _load_half(AutoencoderKL, f"{ckpt_dir}/vae", revision=None)
scheduler = EulerDiscreteScheduler.from_pretrained(f"{ckpt_dir}/scheduler")
unet = _load_half(UNet2DConditionModel, f"{ckpt_dir}/unet", revision=None)

# One ControlNet per conditioning type.
controlnet_depth = _load_half(ControlNetModel, f"{ckpt_dir_depth}", revision=None)
controlnet_canny = _load_half(ControlNetModel, f"{ckpt_dir_canny}", revision=None)
controlnet_pose = _load_half(ControlNetModel, f"{ckpt_dir_pose}", revision=None)

def _build_pipeline(controlnet):
    """Assemble an img2img ControlNet pipeline around the shared Kolors components.

    Only the ControlNet differs between the pipelines; the VAE, text encoder,
    tokenizer, UNet and scheduler objects are the same instances for all of them.
    """
    return StableDiffusionXLControlNetImg2ImgPipeline(
        vae=vae,
        controlnet=controlnet,
        text_encoder=text_encoder,
        tokenizer=tokenizer,
        unet=unet,
        scheduler=scheduler,
        force_zeros_for_empty_prompt=False,
    )


pipe_depth = _build_pipeline(controlnet_depth)
pipe_canny = _build_pipeline(controlnet_canny)
pipe_pose = _build_pipeline(controlnet_pose)

@spaces.GPU
def translate_korean_to_english(text):
    """Translate ``text`` to English when it contains Korean characters.

    A character counts as Korean when its code point lies in the range
    U+AC00..U+D7A3 (inclusive). Text without any such character is returned
    unchanged; otherwise the first result of the module-level ``translator``
    pipeline is returned.
    """
    contains_korean = any(0xAC00 <= ord(ch) <= 0xD7A3 for ch in text)
    if not contains_korean:
        return text
    outputs = translator(text, max_length=512)
    return outputs[0]['translation_text']

@spaces.GPU
def process_canny_condition(image, canny_threods=(100, 200)):
    """Build a three-channel Canny edge map to use as the ControlNet condition.

    Args:
        image: Input image as a NumPy array accepted by ``cv2.Canny``
            (presumably uint8 HxWxC; confirm against callers).
        canny_threods: Two-element sequence whose items are passed to
            ``cv2.Canny`` as its two threshold arguments, in order. The
            default is an immutable tuple so it cannot be shared and mutated
            across calls.

    Returns:
        A ``PIL.Image.Image`` holding the edge map replicated over 3 channels.
    """
    # Work on a copy so the caller's array is never touched.
    edges = cv2.Canny(image.copy(), canny_threods[0], canny_threods[1])
    # cv2.Canny yields a single-channel map; add a channel axis and repeat it
    # three times before handing it to HWC3.
    edges = edges[:, :, None]
    edges = np.concatenate([edges, edges, edges], axis=2)
    edges = HWC3(edges)
    return Image.fromarray(edges)

# Depth estimator instantiated once at import time and reused for every request.
model_midas = MidasDetector()
@spaces.GPU
def process_depth_condition_midas(img, res = 1024):
    """Produce a depth-map condition image for ``img``.

    Args:
        img: Input image as a 3-dimensional NumPy array (height, width, channels).
        res: Resolution argument forwarded to ``resize_image`` before the
            detector runs.

    Returns:
        A ``PIL.Image.Image`` of the detector output, resized back to the
        width and height of ``img``.
    """
    orig_h, orig_w, _ = img.shape
    detector_input = resize_image(HWC3(img), res)
    depth_map = HWC3(model_midas(detector_input))
    # cv2.resize takes the target size as (width, height).
    depth_map = cv2.resize(depth_map, (orig_w, orig_h))
    return Image.fromarray(depth_map)

# Pose detector instantiated once at import time and reused for every request.
model_dwpose = DWposeDetector()
@spaces.GPU
def process_dwpose_condition(image, res=1024):
    """Produce a pose condition image for ``image``.

    Args:
        image: Input image as a 3-dimensional NumPy array (height, width, channels).
        res: Resolution argument forwarded to ``resize_image`` before the
            detector runs.

    Returns:
        A ``PIL.Image.Image`` of the rendered pose, resized back to the width
        and height of ``image``.
    """
    h, w, _ = image.shape
    img = resize_image(HWC3(image), res)
    # Run the detector on the normalised/resized copy. Previously the raw
    # ``image`` was passed, so ``img`` was computed and thrown away and ``res``
    # had no effect. This now mirrors process_depth_condition_midas.
    # The first element of the detector's return value is not needed here.
    _, out_img = model_dwpose(img)
    result = HWC3(out_img)
    # cv2.resize takes the target size as (width, height).
    result = cv2.resize(result, (w, h))
    return Image.fromarray(result)

# Resolution value passed to resize_image and to the depth/pose preprocessors.
MAX_IMAGE_SIZE = 1024
# Upper bound for seeds (largest signed 32-bit integer); used both for the
# seed slider and for random seed generation.
MAX_SEED = np.iinfo(np.int32).max

@spaces.GPU
def infer_depth(prompt,
          image = None,
          negative_prompt = "nsfw, facial shadows, low resolution, jpeg artifacts, blurry, bad quality, dark face, neon lights",
          seed = 397886929,
          randomize_seed = False,
          guidance_scale = 6.0,
          num_inference_steps = 50,
          controlnet_conditioning_scale = 0.7,
          control_guidance_end = 0.9,
          strength = 1.0
        ):
    """Run the depth-conditioned img2img pipeline on ``image``.

    Args:
        prompt: Text prompt; passed through ``translate_korean_to_english``.
        image: Source image. Required even though the signature defaults it to
            ``None`` (the UI supplies a PIL image).
        negative_prompt: Negative prompt; also passed through the translator.
        seed: Seed for the ``torch.Generator``; replaced when
            ``randomize_seed`` is true.
        randomize_seed: When true, draw a fresh seed in ``[0, MAX_SEED]``.
        guidance_scale: Forwarded unchanged to the pipeline.
        num_inference_steps: Forwarded unchanged to the pipeline.
        controlnet_conditioning_scale: Forwarded unchanged to the pipeline.
        control_guidance_end: Forwarded unchanged to the pipeline.
        strength: Forwarded unchanged to the pipeline.

    Returns:
        A tuple ``([condition_image, generated_image], seed)`` where ``seed``
        is the seed actually used.

    Raises:
        gr.Error: If no input image was provided.
    """
    # Fail with a readable UI message instead of an obscure error from
    # resize_image when the image slot is empty.
    if image is None:
        raise gr.Error("Please provide an input image.")

    prompt = translate_korean_to_english(prompt)
    negative_prompt = translate_korean_to_english(negative_prompt)

    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    # Defensive: manual_seed needs an integer, and UI widgets may hand over a
    # numeric value that is not an int.
    seed = int(seed)
    generator = torch.Generator().manual_seed(seed)

    init_image = resize_image(image, MAX_IMAGE_SIZE)
    # Use the module-level device setting rather than a hard-coded string.
    pipe = pipe_depth.to(device)
    condi_img = process_depth_condition_midas(np.array(init_image), MAX_IMAGE_SIZE)
    image = pipe(
        prompt=prompt,
        image=init_image,
        controlnet_conditioning_scale=controlnet_conditioning_scale,
        control_guidance_end=control_guidance_end,
        strength=strength,
        control_image=condi_img,
        negative_prompt=negative_prompt,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        num_images_per_prompt=1,
        generator=generator,
    ).images[0]
    return [condi_img, image], seed

@spaces.GPU
def infer_canny(prompt,
          image = None,
          negative_prompt = "nsfw, facial shadows, low resolution, jpeg artifacts, blurry, bad quality, dark face, neon lights",
          seed = 397886929,
          randomize_seed = False,
          guidance_scale = 6.0,
          num_inference_steps = 50,
          controlnet_conditioning_scale = 0.7,
          control_guidance_end = 0.9,
          strength = 1.0
        ):
    """Run the Canny-edge-conditioned img2img pipeline on ``image``.

    Args:
        prompt: Text prompt; passed through ``translate_korean_to_english``.
        image: Source image. Required even though the signature defaults it to
            ``None`` (the UI supplies a PIL image).
        negative_prompt: Negative prompt; also passed through the translator.
        seed: Seed for the ``torch.Generator``; replaced when
            ``randomize_seed`` is true.
        randomize_seed: When true, draw a fresh seed in ``[0, MAX_SEED]``.
        guidance_scale: Forwarded unchanged to the pipeline.
        num_inference_steps: Forwarded unchanged to the pipeline.
        controlnet_conditioning_scale: Forwarded unchanged to the pipeline.
        control_guidance_end: Forwarded unchanged to the pipeline.
        strength: Forwarded unchanged to the pipeline.

    Returns:
        A tuple ``([condition_image, generated_image], seed)`` where ``seed``
        is the seed actually used.

    Raises:
        gr.Error: If no input image was provided.
    """
    # Fail with a readable UI message instead of an obscure error from
    # resize_image when the image slot is empty.
    if image is None:
        raise gr.Error("Please provide an input image.")

    prompt = translate_korean_to_english(prompt)
    negative_prompt = translate_korean_to_english(negative_prompt)

    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    # Defensive: manual_seed needs an integer, and UI widgets may hand over a
    # numeric value that is not an int.
    seed = int(seed)
    generator = torch.Generator().manual_seed(seed)

    init_image = resize_image(image, MAX_IMAGE_SIZE)
    # Use the module-level device setting rather than a hard-coded string.
    pipe = pipe_canny.to(device)
    condi_img = process_canny_condition(np.array(init_image))
    image = pipe(
        prompt=prompt,
        image=init_image,
        controlnet_conditioning_scale=controlnet_conditioning_scale,
        control_guidance_end=control_guidance_end,
        strength=strength,
        control_image=condi_img,
        negative_prompt=negative_prompt,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        num_images_per_prompt=1,
        generator=generator,
    ).images[0]
    return [condi_img, image], seed

@spaces.GPU
def infer_pose(prompt,
          image = None,
          negative_prompt = "nsfw, facial shadows, low resolution, jpeg artifacts, blurry, bad quality, dark face, neon lights",
          seed = 66,
          randomize_seed = False,
          guidance_scale = 6.0,
          num_inference_steps = 50,
          controlnet_conditioning_scale = 0.7,
          control_guidance_end = 0.9,
          strength = 1.0
        ):
    """Run the pose-conditioned img2img pipeline on ``image``.

    Args:
        prompt: Text prompt; passed through ``translate_korean_to_english``.
        image: Source image. Required even though the signature defaults it to
            ``None`` (the UI supplies a PIL image).
        negative_prompt: Negative prompt; also passed through the translator.
        seed: Seed for the ``torch.Generator``; replaced when
            ``randomize_seed`` is true.
        randomize_seed: When true, draw a fresh seed in ``[0, MAX_SEED]``.
        guidance_scale: Forwarded unchanged to the pipeline.
        num_inference_steps: Forwarded unchanged to the pipeline.
        controlnet_conditioning_scale: Forwarded unchanged to the pipeline.
        control_guidance_end: Forwarded unchanged to the pipeline.
        strength: Forwarded unchanged to the pipeline.

    Returns:
        A tuple ``([condition_image, generated_image], seed)`` where ``seed``
        is the seed actually used.

    Raises:
        gr.Error: If no input image was provided.
    """
    # Fail with a readable UI message instead of an obscure error from
    # resize_image when the image slot is empty.
    if image is None:
        raise gr.Error("Please provide an input image.")

    prompt = translate_korean_to_english(prompt)
    negative_prompt = translate_korean_to_english(negative_prompt)

    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    # Defensive: manual_seed needs an integer, and UI widgets may hand over a
    # numeric value that is not an int.
    seed = int(seed)
    generator = torch.Generator().manual_seed(seed)

    init_image = resize_image(image, MAX_IMAGE_SIZE)
    # Use the module-level device setting rather than a hard-coded string.
    pipe = pipe_pose.to(device)
    condi_img = process_dwpose_condition(np.array(init_image), MAX_IMAGE_SIZE)
    image = pipe(
        prompt=prompt,
        image=init_image,
        controlnet_conditioning_scale=controlnet_conditioning_scale,
        control_guidance_end=control_guidance_end,
        strength=strength,
        control_image=condi_img,
        negative_prompt=negative_prompt,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        num_images_per_prompt=1,
        generator=generator,
    ).images[0]
    return [condi_img, image], seed

# Example rows shown in the UI, one table per conditioning mode. Each row is
# [prompt, image path], matching the [prompt, image] inputs of gr.Examples.
# The prompts are written in Korean; the infer_* functions pass prompts
# through translate_korean_to_english before running the pipeline.

# Examples for the Canny mode.
canny_examples = [
    ["아름다운 소녀, 고품질, 초고해상도, 생생한 색상, 최고의 품질, 8k, HD, 4K",
     "image/woman_1.png"],
    ["전경, 귀여운 흰 강아지가 컵에 앉아 카메라를 보고 있다, 애니메이션 스타일, 3D 렌더링",
    "image/dog.png"]
]

# Examples for the Depth mode.
depth_examples = [
    ["신카이 마코토 스타일, 풍부한 색감, 녹색 셔츠를 입은 여성이 들판에 서 있다, 아름다운 풍경, 상쾌하고 밝은, 반짝이는 빛, 최고의 품질, 초세밀, 8K 화질",
     "image/woman_2.png"],
    ["화려한 색상의 작은 새, 고품질, 초고해상도, 생생한 색상, 최고의 품질, 8k, HD, 4K",
     "image/bird.png"]
]

# Examples for the Pose mode.
pose_examples = [
    ["보라색 퍼프 소매 드레스를 입고 왕관과 흰색 레이스 장갑을 낀 소녀가 양 손으로 얼굴을 감싸고 있다, 고품질, 초고해상도, 생생한 색상, 최고의 품질, 8k, HD, 4K",
     "image/woman_3.png"],
    ["검은색 스포츠 재킷과 흰색 이너를 입고 목걸이를 한 여성이 거리에 서 있다, 배경에는 빨간 건물과 녹색 나무가 있다, 고품질, 초고해상도, 생생한 색상, 최고의 품질, 8k, HD, 4K",
     "image/woman_4.png"]
]

# Custom stylesheet passed to gr.Blocks(css=...). The selectors target the
# elem_id values assigned in the layout below: "col-left" and "col-right" for
# the two columns, and "button" (shared by the Canny, Depth and Pose buttons).
css="""
#col-left {
    margin: 0 auto;
    max-width: 600px;
}
#col-right {
    margin: 0 auto;
    max-width: 750px;
}
#button {
    color: blue;
}
"""

def load_description(fp):
    """Return the full contents of the UTF-8 text file at ``fp``."""
    with open(fp, encoding='utf-8') as handle:
        return handle.read()

def _unit_slider(label, value):
    """Create a 0.0-1.0 slider with 0.1 steps; must be called inside a layout context."""
    return gr.Slider(label=label, minimum=0.0, maximum=1.0, step=0.1, value=value)


with gr.Blocks(css=css) as Kolors:
    with gr.Row():
        # Left column: prompt, source image, advanced settings and run buttons.
        with gr.Column(elem_id="col-left"):
            with gr.Row():
                prompt = gr.Textbox(label="Prompt", placeholder="Enter your prompt", lines=2)
            with gr.Row():
                image = gr.Image(label="Image", type="pil")
            with gr.Accordion("Advanced Settings", open=False):
                negative_prompt = gr.Textbox(
                    label="Negative prompt",
                    placeholder="Enter a negative prompt",
                    visible=True,
                    value="nsfw, facial shadows, low resolution, jpeg artifacts, blurry, bad quality, dark face, neon lights",
                )
                seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0)
                randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
                with gr.Row():
                    guidance_scale = gr.Slider(
                        label="Guidance scale", minimum=0.0, maximum=10.0, step=0.1, value=6.0
                    )
                    num_inference_steps = gr.Slider(
                        label="Number of inference steps", minimum=10, maximum=50, step=1, value=30
                    )
                with gr.Row():
                    controlnet_conditioning_scale = _unit_slider("Controlnet Conditioning Scale", 0.7)
                    control_guidance_end = _unit_slider("Control Guidance End", 0.9)
                with gr.Row():
                    strength = _unit_slider("Strength", 1.0)
            with gr.Row():
                canny_button = gr.Button("Canny", elem_id="button")
                depth_button = gr.Button("Depth", elem_id="button")
                pose_button = gr.Button("Pose", elem_id="button")

        # Right column: gallery (condition image + generated image) and the seed used.
        with gr.Column(elem_id="col-right"):
            result = gr.Gallery(label="Result", show_label=False, columns=2)
            seed_used = gr.Number(label="Seed Used")

    # One examples table per mode, each in its own row, in Canny/Depth/Pose order.
    for example_label, example_fn, example_rows in (
        ("Canny", infer_canny, canny_examples),
        ("Depth", infer_depth, depth_examples),
        ("Pose", infer_pose, pose_examples),
    ):
        with gr.Row():
            gr.Examples(
                fn=example_fn,
                examples=example_rows,
                inputs=[prompt, image],
                outputs=[result, seed_used],
                label=example_label,
            )

    # Every button sends the same ordered inputs; the order matches the
    # positional parameters of the infer_* functions.
    run_inputs = [
        prompt,
        image,
        negative_prompt,
        seed,
        randomize_seed,
        guidance_scale,
        num_inference_steps,
        controlnet_conditioning_scale,
        control_guidance_end,
        strength,
    ]
    for run_button, run_fn in (
        (canny_button, infer_canny),
        (depth_button, infer_depth),
        (pose_button, infer_pose),
    ):
        run_button.click(fn=run_fn, inputs=run_inputs, outputs=[result, seed_used])

# Enable request queuing and start the app.
Kolors.queue().launch(debug=True)