import re

import torch
import gradio as gr
import spaces
from diffusers import PixArtTransformer2DModel
from pipeline import PixArtSigmaPipeline
# Load the two pre-trained diffusion model stages.
base_model = "ptx0/pixart-900m-1024-ft-v0.7-stage1"
stg2_model = "ptx0/pixart-900m-1024-ft-v0.7-stage2"
torch_device = "cuda"
# bfloat16 is assumed here as the inference dtype; any dtype the GPU supports works.
torch_precision = torch.bfloat16
base_pipeline = PixArtSigmaPipeline.from_pretrained(
    base_model, use_safetensors=True
).to(dtype=torch_precision, device=torch_device)
# Reuse the stage-1 components (text encoder, VAE, scheduler) for stage 2 and
# swap in only the stage-2 transformer, so nothing is loaded twice.
stg2_pipeline = PixArtSigmaPipeline.from_pretrained(stg2_model, **base_pipeline.components)
stg2_pipeline.transformer = PixArtTransformer2DModel.from_pretrained(
    stg2_model, subfolder="transformer"
).to(dtype=torch_precision, device=torch_device)
def extract_resolution(resolution_str):
    """Parse a 'WIDTHxHEIGHT' string into an (int, int) tuple, or return None."""
    match = re.match(r"(\d+)x(\d+)", resolution_str)
    if match:
        width = int(match.group(1))
        height = int(match.group(2))
        return (width, height)
    return None
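
# Quick illustrative self-check of the parser (safe to remove):
assert extract_resolution("1152x960") == (1152, 960)
assert extract_resolution("not-a-size") is None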
# Image generation with adjustable parameters. @spaces.GPU requests a GPU
# for the duration of each call on Hugging Face Spaces.
@spaces.GPU
def generate(prompt, guidance_scale, num_inference_steps, resolution, negative_prompt):
    width, height = extract_resolution(resolution) or (1024, 1024)
    # A fixed seed keeps generations reproducible for a given prompt and settings.
    mixture_generator = torch.Generator().manual_seed(444)
    # Fraction of the noise schedule handled by stage 1 (timesteps 999-400).
    stage1_strength = 0.6
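    # Stage 1: denoise from pure noise down to `denoising_end`, returning
    # latents rather than decoded images.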
latent_images = base_pipeline(
prompt=prompt,
negative_prompt=negative_prompt,
num_inference_steps=num_inference_steps,
num_images_per_prompt=1,
generator=mixture_generator,
guidance_scale=guidance_scale,
output_type="latent",
denoising_end=stage1_strength,
width=width,
height=height
).images
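    # Stage 2: resume denoising from the stage-1 latents at `denoising_start`
    # and decode the final images.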
return stg2_pipeline(
prompt=prompt,
negative_prompt=negative_prompt,
latents=latent_images,
num_inference_steps=num_inference_steps,
num_images_per_prompt=1,
generator=mixture_generator,
guidance_scale=guidance_scale,
denoising_start=stage1_strength
).images
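
# Hedged sketch (not wired into the UI): per the model notes in the interface
# description, stage 2 can also be used stand-alone as a refiner, but then it
# only needs about half its trained range (~20% of the schedule, i.e.
# denoising_start=0.8 rather than 0.6). The function name, defaults, and the
# source of `latents` are illustrative assumptions, not part of the original.
def refine_standalone(prompt, latents, negative_prompt="", steps=28, cfg=3.4):
    return stg2_pipeline(
        prompt=prompt,
        negative_prompt=negative_prompt,
        latents=latents,
        num_inference_steps=steps,
        guidance_scale=cfg,
        denoising_start=0.8,
    ).images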
# Example prompts demonstrating the model's range. Each row supplies one value
# per input component: prompt, guidance scale, steps, resolution, negative prompt.
example_prompts = [
    ["A futuristic cityscape at night under a starry sky", 7.5, 25, "1024x1024", "blurry, overexposed"],
    ["A serene landscape with a flowing river and autumn trees", 8.0, 20, "1152x960", "crowded, noisy"],
    ["An abstract painting of joy and energy in bright colors", 9.0, 30, "896x1152", "dark, dull"],
]
# Build the Gradio interface.
iface = gr.Interface(
fn=generate,
inputs=[
gr.Text(label="Enter your prompt"),
gr.Slider(1, 20, step=0.1, label="Guidance Scale", value=3.4),
gr.Slider(1, 50, step=1, label="Number of Inference Steps", value=28),
gr.Radio(["1024x1024", "1152x960", "896x1152"], label="Resolution", value="1152x960"),
gr.Text(value="underexposed, blurry, ugly, washed-out", label="Negative Prompt")
],
outputs=gr.Gallery(height=1024, min_width=1024, columns=2),
examples=example_prompts,
title="PixArt 900M",
    description=(
        "This is a two-stage mixture-of-experts model implemented in the spirit of NVIDIA's eDiff-I.\n"
        "The weights were initialised from terminusresearch/pixart-900m-1024-ft-v0.6 and trained separately on the timestep ranges 999-400 and 400-0.\n"
        "This results in two models where the first stage is responsible for most of the image's composition and colour, and the second stage handles minor-to-fine details.\n"
        "\n"
        "Compared to SDXL's refiner, the second stage here handles twice as many timesteps, which lets it make fuller use of the text-conditional guidance and improves its capabilities.\n"
        "\n"
        "Although stage 2 is trained on 40% of the schedule, using it stand-alone as a refiner (img2img) needs about half that strength, roughly 20%.\n"
        "When used in the two-stage pipeline, it should be configured to handle its full 40% range.\n"
        "\n"
        "This model is funded and trained by Terminus Research Group. "
        "If you would like to collaborate or provide compute, please see the organisation page for how to find us on Discord."
    ),
)