import torch
from diffusers import UnCLIPScheduler, DDPMScheduler, StableUnCLIPPipeline
from diffusers.models import PriorTransformer
from transformers import CLIPTokenizer, CLIPTextModelWithProjection


def init_text2img_pipe():
    """Build a Stable unCLIP text-to-image pipeline wired to a Karlo prior.

    The prior transformer and its scheduler config come from
    ``kakaobrain/karlo-v1-alpha``; the prior tokenizer and text encoder come
    from ``openai/clip-vit-large-patch14``; the remaining components come
    from ``stabilityai/stable-diffusion-2-1-unclip-small``.

    When CUDA is available the models are loaded in ``torch.float16`` and the
    pipeline is moved to ``"cuda"``; otherwise ``torch.float32`` and ``"cpu"``
    are used.

    Returns:
        Whatever ``StableUnCLIPPipeline.from_pretrained(...).to(device)``
        returns.
    """
    # Probe CUDA once and derive both the target device and the dtype from it.
    if torch.cuda.is_available():
        device, data_type = "cuda", torch.float16
    else:
        device, data_type = "cpu", torch.float32

    karlo_repo = "kakaobrain/karlo-v1-alpha"
    clip_repo = "openai/clip-vit-large-patch14"
    unclip_repo = "stabilityai/stable-diffusion-2-1-unclip-small"

    prior_net = PriorTransformer.from_pretrained(
        karlo_repo, subfolder="prior", torch_dtype=data_type
    )
    tokenizer = CLIPTokenizer.from_pretrained(clip_repo)
    text_encoder = CLIPTextModelWithProjection.from_pretrained(
        clip_repo, torch_dtype=data_type
    )

    # The UnCLIP scheduler is loaded only for its config; the scheduler that
    # is actually handed to the pipeline is a DDPMScheduler built from it.
    karlo_scheduler = UnCLIPScheduler.from_pretrained(
        karlo_repo, subfolder="prior_scheduler"
    )
    ddpm_scheduler = DDPMScheduler.from_config(karlo_scheduler.config)

    prior_components = {
        "prior_tokenizer": tokenizer,
        "prior_text_encoder": text_encoder,
        "prior": prior_net,
        "prior_scheduler": ddpm_scheduler,
    }
    # NOTE: variant="fp16" is requested regardless of the dtype chosen above.
    unclip_pipe = StableUnCLIPPipeline.from_pretrained(
        unclip_repo,
        torch_dtype=data_type,
        variant="fp16",
        **prior_components,
    )
    return unclip_pipe.to(device)


def predict(
    prompt: str,
    negative_prompt: str,
    pipeline,
    *,
    height: int = 600,
    width: int = 400,
    num_inference_steps: int = 60,
):
    """Run ``pipeline`` on a prompt pair and return the generated images.

    Args:
        prompt: Text passed to the pipeline as ``prompt``.
        negative_prompt: Text passed to the pipeline as ``negative_prompt``.
        pipeline: Callable accepting the keyword arguments ``prompt``,
            ``negative_prompt``, ``height``, ``width`` and
            ``num_inference_steps`` and returning an object with an
            ``images`` attribute.
        height: Forwarded to the pipeline as ``height``. Defaults to 600.
        width: Forwarded to the pipeline as ``width``. Defaults to 400.
        num_inference_steps: Forwarded to the pipeline as
            ``num_inference_steps``. Defaults to 60.

    Returns:
        The ``images`` attribute of the pipeline's result.
    """
    # The size and step arguments are keyword-only so existing three-argument
    # positional calls keep their meaning and new callers must name overrides.
    result = pipeline(
        prompt=prompt,
        negative_prompt=negative_prompt,
        height=height,
        width=width,
        num_inference_steps=num_inference_steps,
    )
    return result.images


if __name__ == "__main__":
    # Build the pipeline once, then generate images for a single prompt pair.
    pipe = init_text2img_pipe()
    generated = predict("a dog", "a cat", pipe)

    # Write each returned image to its own zero-based numbered PNG.
    for idx, picture in enumerate(generated):
        out_path = f"/root/autodl-tmp/image_{idx}.png"
        picture.save(out_path)