# NOTE(review): import order is kept exactly as in the original; `spaces` is
# imported before `torch`, which is presumably deliberate for the Hugging Face
# Spaces GPU runtime -- confirm before regrouping these imports.
import argparse
import os
from PIL import Image
import gradio as gr
import spaces
from imagenet_en_cn import IMAGENET_1K_CLASSES
from omegaconf import OmegaConf
from huggingface_hub import snapshot_download
import torch
from transformers import T5EncoderModel, AutoTokenizer
from pixelflow.scheduling_pixelflow import PixelFlowScheduler
from pixelflow.pipeline_pixelflow import PixelFlowPipeline
from pixelflow.utils import config as config_utils
from pixelflow.utils.misc import seed_everything

# Command-line options for the demo.
parser = argparse.ArgumentParser(description='Gradio Demo', add_help=False)
parser.add_argument('--checkpoint', type=str, help='checkpoint folder path')
parser.add_argument('--class_cond', action='store_true', help='use class conditional generation')
args = parser.parse_args()

# deploy
# These assignments replace whatever was parsed from the command line, so the
# text-to-image branch below is the one that runs.
args.checkpoint = "pixelflow_t2i"
args.class_cond = False

output_dir = args.checkpoint

# Pick the Hub repository matching the requested conditioning mode.
if args.class_cond:
    _repo_id = "ShoufaChen/PixelFlow-Class2Image"
else:
    _repo_id = "ShoufaChen/PixelFlow-Text2Image"

# Fetch the checkpoint folder only when it is not already on disk.
if not os.path.exists(output_dir):
    snapshot_download(repo_id=_repo_id, local_dir=output_dir)

# Build the model from its config and read the weights onto the CPU.
config = OmegaConf.load(f"{output_dir}/config.yaml")
model = config_utils.instantiate_from_config(config.model)
print(f"Num of parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")
ckpt = torch.load(f"{output_dir}/model.pt", map_location="cpu", weights_only=True)

# Mode-specific settings: the class-conditional mode uses no text encoder or
# tokenizer; the text-to-image mode loads the flan-t5-xl encoder and tokenizer.
if args.class_cond:
    text_encoder = None
    tokenizer = None
    resolution = 256
    NUM_EXAMPLES = 4
else:
    text_encoder = T5EncoderModel.from_pretrained("google/flan-t5-xl")
    tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xl")
    resolution = 1024
    NUM_EXAMPLES = 1

# strict=True: the checkpoint keys must match the model exactly.
model.load_state_dict(ckpt, strict=True)
model.eval()
print(f"outside space.GPU. {torch.cuda.is_available()=}") if torch.cuda.is_available(): model = model.cuda() text_encoder = text_encoder.cuda() if text_encoder else None device = torch.device("cuda") else: raise ValueError("No GPU") scheduler = PixelFlowScheduler(config.scheduler.num_train_timesteps, num_stages=config.scheduler.num_stages, gamma=-1/3) pipeline = PixelFlowPipeline( scheduler, model, text_encoder=text_encoder, tokenizer=tokenizer, max_token_length=512, ) @spaces.GPU(duration=120) def infer(noise_shift, cfg_scale, class_label, seed, *num_steps_per_stage): print(f"inside space.GPU. {torch.cuda.is_available()=}") seed_everything(seed) with torch.autocast("cuda", dtype=torch.bfloat16), torch.no_grad(): samples = pipeline( prompt=[class_label] * NUM_EXAMPLES, height=resolution, width=resolution, num_inference_steps=list(num_steps_per_stage), guidance_scale=cfg_scale, # The guidance for the first frame, set it to 7 for 384p variant device=device, shift=noise_shift, use_ode_dopri5=False, ) samples = (samples * 255).round().astype("uint8") samples = [Image.fromarray(sample) for sample in samples] return samples css = """ h1 { text-align: center; display: block; } .follow-link { margin-top: 0.8em; font-size: 1em; text-align: center; } """ with gr.Blocks(css=css) as demo: gr.Markdown("# PixelFlow: Pixel-Space Generative Models with Flow") gr.HTML("""