import argparse
import time
from pathlib import Path

import torch
from diffusers import PixArtAlphaPipeline
from diffusers.pipelines.flux import FluxPriorReduxPipeline
from diffusers.pipelines.flux.modeling_flux import ReduxImageEncoder
from PIL import Image
from transformers import SiglipImageProcessor

# Shared pipeline objects, loaded once in main().
pipe = None
redux = None
redux_embedder = None

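# generate() encodes the image prompt with the Redux SigLIP vision encoder, projects the
# resulting tokens into PixArt's prompt-embedding space with the custom Redux embedder,
# and passes them to the PixArt-Alpha pipeline as prompt embeddings. The text prompt is
# only used to name the output file.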
def generate(prompt, image_prompt=None, guidance_scale=2, num_images=4, resolution=512):
    if image_prompt is None:
        raise ValueError("an image prompt is required; conditioning comes from the Redux image embeddings")

    with torch.no_grad():
        # Preprocess the image prompt for the SigLIP vision encoder (384x384, as used by Redux).
        clip_image_processor = SiglipImageProcessor(size={"height": 384, "width": 384})
        clip_pixel_values = clip_image_processor.preprocess(
            image_prompt.convert("RGB"), return_tensors="pt"
        ).pixel_values.to("cuda", dtype=torch.bfloat16)

        # Encode the image with the Redux SigLIP encoder, then project the tokens into
        # PixArt's prompt-embedding space with the custom Redux embedder.
        image_prompt_latents = redux.image_encoder(clip_pixel_values).last_hidden_state
        image_prompt_embeds = redux_embedder(image_prompt_latents).image_embeds
        # PixArt-Alpha accepts at most 120 prompt tokens, so truncate the projected sequence.
        prompt_embeds = image_prompt_embeds[:, :120, :]
        attention_mask = torch.ones(prompt_embeds.shape[0], prompt_embeds.shape[1]).to("cuda")

        images = pipe(
            prompt_embeds=prompt_embeds,
            prompt_attention_mask=attention_mask,
            negative_prompt="",
            height=resolution,
            width=resolution,
            guidance_scale=guidance_scale,
            num_images_per_prompt=num_images,
        ).images
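
    # Tile the generated images side by side with a 1-pixel gap.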
    widths, heights = zip(*[img.size for img in images])
    total_width = sum(widths) + len(images) - 1
    max_height = max(heights)
    out = Image.new('RGB', (total_width, max_height))
    x_offset = 0
    for img in images:
        out.paste(img, (x_offset, 0))
        x_offset += img.width + 1
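
    # Stack the (resized) image prompt above the tiled outputs for easy comparison.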
    if image_prompt is not None:
        out_with_image_prompt = Image.new('RGB', (out.width, out.height + 1 + resolution))
        resized_prompt = image_prompt.resize((resolution, resolution), Image.Resampling.BILINEAR)
        out_with_image_prompt.paste(resized_prompt, (0, 0))
        out_with_image_prompt.paste(out, (0, resolution + 1))
        out = out_with_image_prompt
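
    # Save the composite to image-outputs/, named after the prompt text and a timestamp.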
    Path("image-outputs").mkdir(parents=True, exist_ok=True)
    output_filename = f"image-outputs/{prompt[:40].replace(' ', '_')}.{int(time.time())}.png"
    out.save(output_filename)
    print(f"Saved output to {output_filename}")

def main():
    parser = argparse.ArgumentParser(
        description="Generate images using an image and a text prompt (PixArt Custom Redux)."
    )
    parser.add_argument("--prompt", type=str, default="",
                        help='Text prompt; currently only used to name the output file (default: "")')
    parser.add_argument("--image_prompt", type=str, default=None,
                        help="Path to the image to use as a prompt (required)")
    parser.add_argument("--guidance_scale", type=float, default=2,
                        help="Guidance scale for image generation (default: 2)")
    parser.add_argument("--num_images", type=int, default=4,
                        help="Number of images to generate (default: 4)")
    parser.add_argument("--resolution", type=int, default=512,
                        help="Resolution for generated images (default: 512)")
    args = parser.parse_args()
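
    # Load the PixArt-Alpha base pipeline, the custom Redux projection ("pixart-custom-redux"),
    # and the Flux Redux prior pipeline, which supplies the SigLIP image encoder. The two Redux
    # paths are assumed to be local checkpoints or Hugging Face repo IDs.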
    global pipe, redux, redux_embedder
    pipe = PixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-XL-2-512x512", torch_dtype=torch.bfloat16)
    redux_embedder = ReduxImageEncoder.from_pretrained("pixart-custom-redux", torch_dtype=torch.bfloat16)
    redux = FluxPriorReduxPipeline.from_pretrained("FLUX.1-Redux-dev", image_embedder=redux_embedder, torch_dtype=torch.bfloat16)

    pipe.to("cuda")
    redux.to("cuda")

    img_prompt = Image.open(args.image_prompt) if args.image_prompt else None
    generate(args.prompt, image_prompt=img_prompt, guidance_scale=args.guidance_scale,
             num_images=args.num_images, resolution=args.resolution)


if __name__ == "__main__":
    main()
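
# Example invocation (the script filename here is illustrative):
#   python pixart_redux_generate.py --prompt "a red fox" --image_prompt reference.png \
#       --guidance_scale 2 --num_images 4 --resolution 512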