import argparse
import time

import torch
from diffusers import FluxTransformer2DModel
from transformers import CLIPModel
from pathlib import Path
from PIL import Image

from open_flux_pipeline import FluxWithCFGPipeline

pipe = None


def generate(prompt, image_prompt=None, guidance_scale=2, num_images=4, resolution=512):
    # Create blank image prompt backgrounds; a real image prompt replaces the
    # positive slot below, while the negative image prompt stays blank.
    image_prompt_kwargs = {
        "image_prompt": Image.new("RGB", (resolution, resolution)),
        "negative_image_prompt": Image.new("RGB", (resolution, resolution)),
    }
    if image_prompt is not None:
        image_prompt_kwargs["image_prompt"] = image_prompt

    with torch.no_grad():
        images = pipe(
            prompt=prompt,
            negative_prompt="",
            height=resolution,
            width=resolution,
            max_sequence_length=256,
            guidance_scale=guidance_scale,
            num_images_per_prompt=num_images,
            **image_prompt_kwargs,
        ).images

    # Concatenate all images horizontally with a 1-pixel gap between them
    widths, heights = zip(*[img.size for img in images])
    total_width = sum(widths) + len(images) - 1
    max_height = max(heights)
    out = Image.new("RGB", (total_width, max_height))
    x_offset = 0
    for img in images:
        out.paste(img, (x_offset, 0))
        x_offset += img.width + 1

    # If an image prompt was provided, stack it above the generated images
    if image_prompt is not None:
        out_with_image_prompt = Image.new("RGB", (out.width, out.height + 1 + resolution))
        resized_prompt = image_prompt.resize((resolution, resolution), Image.Resampling.BILINEAR)
        out_with_image_prompt.paste(resized_prompt, (0, 0))
        out_with_image_prompt.paste(out, (0, resolution + 1))
        out = out_with_image_prompt

    # Ensure the output directory exists and save the final image
    Path("image-outputs").mkdir(parents=True, exist_ok=True)
    output_filename = f"image-outputs/{prompt[:40].replace(' ', '_')}.{int(time.time())}.png"
    out.save(output_filename)
    print(f"Saved output to {output_filename}")


def main():
    parser = argparse.ArgumentParser(
        description="Generate images using an image and a text prompt (Flux Image Variations)."
    )
    parser.add_argument("--prompt", type=str, default="",
                        help='The text prompt for image generation (default: "")')
    parser.add_argument("--image_prompt", type=str, default=None,
                        help="Path to an optional image to use as a prompt")
    parser.add_argument("--guidance_scale", type=float, default=2,
                        help="Guidance scale for image generation (default: 2)")
    parser.add_argument("--num_images", type=int, default=4,
                        help="Number of images to generate (default: 4)")
    parser.add_argument("--resolution", type=int, default=512,
                        help="Resolution for generated images (default: 512)")
    args = parser.parse_args()

    # Load models and pipelines: build the pipeline without a transformer first,
    # then swap in the image-variations transformer checkpoint.
    global pipe
    clip = CLIPModel.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=torch.bfloat16)
    pipe = FluxWithCFGPipeline.from_pretrained(
        "ostris/OpenFLUX.1",
        text_encoder=clip,
        transformer=None,
        torch_dtype=torch.bfloat16,
    )
    pipe.transformer = FluxTransformer2DModel.from_pretrained(
        "flux-image-variations-model", torch_dtype=torch.bfloat16
    )
    pipe.to("cuda")

    img_prompt = Image.open(args.image_prompt) if args.image_prompt else None
    generate(
        args.prompt,
        image_prompt=img_prompt,
        guidance_scale=args.guidance_scale,
        num_images=args.num_images,
        resolution=args.resolution,
    )


if __name__ == "__main__":
    main()
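
# Example invocation (a sketch; the script filename "generate.py" and the local
# "flux-image-variations-model" checkpoint directory are assumptions about your setup):
#
#   python generate.py --prompt "a red fox in the snow" \
#       --image_prompt reference.jpg --guidance_scale 2 --num_images 4 --resolution 512
#
# Without --image_prompt, a blank image prompt is used and generation is driven by the
# text prompt alone; the resulting image grid is saved under image-outputs/.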