"""Gradio demo for PixArt 900M: a two-stage mixture-of-experts text-to-image pipeline.

Stage 1 denoises the high-noise portion of the schedule (composition/colour);
stage 2 finishes the low-noise portion (fine detail), in the spirit of
NVIDIA's eDiff-I. Both stages share every component except the transformer.
"""
import re

import torch
import gradio as gr
import spaces
from diffusers.models import PixArtTransformer2DModel

from pipeline import PixArtSigmaPipeline

# Load the pre-trained diffusion model checkpoints for each stage.
base_model = "ptx0/pixart-900m-1024-ft-v0.7-stage1"
stg2_model = "ptx0/pixart-900m-1024-ft-v0.7-stage2"
torch_device = "cuda"
torch_precision = torch.bfloat16

base_pipeline = PixArtSigmaPipeline.from_pretrained(
    base_model, use_safetensors=True
).to(dtype=torch_precision, device=torch_device)

# Stage 2 reuses stage 1's components (VAE, text encoder, scheduler, ...) to
# avoid loading them twice; only the transformer comes from the stage-2 repo.
stg2_pipeline = PixArtSigmaPipeline.from_pretrained(
    stg2_model, **base_pipeline.components
)
stg2_pipeline.transformer = PixArtTransformer2DModel.from_pretrained(
    stg2_model, subfolder="transformer"
).to(dtype=torch_precision, device=torch_device)


def extract_resolution(resolution_str):
    """Parse a 'WIDTHxHEIGHT' string into an ``(width, height)`` int tuple.

    Returns None when the string does not start with the expected pattern,
    letting the caller fall back to a default resolution.
    """
    match = re.match(r'(\d+)x(\d+)', resolution_str)
    if match:
        width = int(match.group(1))
        height = int(match.group(2))
        return (width, height)
    else:
        return None


# Define the image generation function with adjustable parameters and a progress bar
@spaces.GPU
def generate(prompt, stage1_guidance_scale, stage2_guidance_scale,
             num_inference_steps, resolution, negative_prompt):
    """Run the two-stage pipeline and return the generated images.

    Stage 1 produces latents for the first 60% of the denoising schedule;
    stage 2 consumes those latents and completes the remaining 40%.
    Both stages share one seeded generator so results are reproducible.
    """
    width, height = extract_resolution(resolution) or (1024, 1024)
    mixture_generator = torch.Generator().manual_seed(444)
    # Handoff point in the schedule: stage 1 stops (denoising_end) exactly
    # where stage 2 resumes (denoising_start).
    stage1_strength = 0.6
    latent_images = base_pipeline(
        prompt=prompt,
        negative_prompt=negative_prompt,
        num_inference_steps=num_inference_steps,
        num_images_per_prompt=1,
        generator=mixture_generator,
        guidance_scale=stage1_guidance_scale,
        output_type="latent",  # keep latents on-device for the stage-2 handoff
        denoising_end=stage1_strength,
        width=width,
        height=height,
    ).images
    return stg2_pipeline(
        prompt=prompt,
        negative_prompt=negative_prompt,
        latents=latent_images,
        num_inference_steps=num_inference_steps,
        num_images_per_prompt=1,
        generator=mixture_generator,
        guidance_scale=stage2_guidance_scale,
        denoising_start=stage1_strength,
    ).images


# Example prompts to demonstrate the model's capabilities.
# Columns match the Interface inputs: prompt, stage-1 CFG, stage-2 CFG,
# steps, resolution, negative prompt.
example_prompts = [
    [
        "A futuristic cityscape at night under a starry sky",
        3.5, 4.5, 25, "1152x960", "blurry, overexposed",
    ],
    [
        "A serene landscape with a flowing river and autumn trees",
        3.0, 4.0, 20, "1152x960", "crowded, noisy",
    ],
    [
        "An abstract painting of joy and energy in bright colors",
        3.0, 4.5, 30, "896x1152", "dark, dull",
    ],
    [
        "a stunning portrait of a hamster with an eye patch, piloting a miniature cessna on a wooden desk in an office, depth of field, bokeh, sharp, f1.4",
        3.2, 4.6, 40, "1024x1024", "this is an ugly photograph that no one liked",
    ],
    [
        "Check out my cousin larry in his dirty room, he is such a damn mess",
        3.2, 4.6, 40, "1152x960", "the photograph is blurry and unremarkable",
    ],
]

# Create a Gradio interface, 1024x1024,1152x960,896x1152
iface = gr.Interface(
    fn=generate,
    inputs=[
        gr.Text(label="Enter your prompt"),
        gr.Slider(1, 20, step=0.1, label="Guidance Scale (Stage I)", value=3.4),
        gr.Slider(1, 20, step=0.1, label="Guidance Scale (Stage II)", value=4.2),
        gr.Slider(1, 50, step=1, label="Number of Inference Steps", value=35),
        gr.Radio(["1024x1024", "1152x960", "896x1152"], label="Resolution", value="1024x1024"),
        gr.Text(value="underexposed, blurry, ugly, washed-out", label="Negative Prompt"),
    ],
    outputs=gr.Gallery(height=1024, min_width=1024, columns=2),
    examples=example_prompts,
    title="PixArt 900M",
    # Implicitly-concatenated literals; "\n" pieces are paragraph breaks in
    # the rendered markdown.
    description=(
        "This is a two-stage mixture-of-experts model implemented in the spirit of NVIDIA's E-Diffi model."
        "\nThe weights were initialised from terminusresearch/pixart-900m-1024-ft-v0.6 and trained separately on timestep ranges 999-400 and 400-0."
        "\nThis results in two models where the first stage is responsible for most of the image's composition and colour, and the second stage handles minor-to-fine details."
        "\n"
        "\nIn comparison to SDXL's refiner, the second stage here handles twice as many timesteps, which allows it to make more use of the text-conditional guidance, improving its capabilities."
        "\n"
        "\nDespite being trained with 40% of the schedule, you will discover that using stage 2 stand-alone as a refiner (img2img) will need half the strength - about 20%."
        "\nWhen being used in the two-stage pipeline, it should be configured to handle all of its 40% range."
        "\n"
        "\nThis model is funded and trained by Terminus Research Group."
        " If you would like to collaborate or provide compute, please see the organisation page for how to locate us on Discord."
        "\n"
        "\n"
        ""
    )
).launch()