# app.py — PixArt 900M two-stage Gradio demo
# (Hugging Face Space by PseudoTerminal X; commit 62dff5f, 5.25 kB)
import torch
from pipeline import PixArtSigmaPipeline
from diffusers.models import PixArtTransformer2DModel
import gradio as gr
import spaces
# Load the pre-trained diffusion model.
# Two-stage mixture-of-experts setup: stage 1 denoises the early (high-noise)
# part of the schedule, stage 2 refines the remainder (see `generate` below).
base_model = "ptx0/pixart-900m-1024-ft-v0.7-stage1"
stg2_model = "ptx0/pixart-900m-1024-ft-v0.7-stage2"
torch_device = "cuda"  # assumes a CUDA GPU is available — this Space runs under @spaces.GPU
torch_precision = torch.bfloat16
base_pipeline = PixArtSigmaPipeline.from_pretrained(
    base_model, use_safetensors=True
).to(dtype=torch_precision, device=torch_device)
# Reuse the stage-1 pipeline's components (VAE, text encoder, tokenizer, scheduler)
# so they are only loaded once; only the transformer differs between the stages.
stg2_pipeline = PixArtSigmaPipeline.from_pretrained(stg2_model, **base_pipeline.components)
# Swap in the stage-2 transformer weights on the same device/dtype as stage 1.
stg2_pipeline.transformer = PixArtTransformer2DModel.from_pretrained(stg2_model, subfolder="transformer").to(dtype=torch_precision, device=torch_device)
import re
def extract_resolution(resolution_str):
    """Parse a leading ``WIDTHxHEIGHT`` prefix into an ``(int, int)`` tuple.

    Returns ``None`` when *resolution_str* does not begin with that pattern.
    """
    parsed = re.match(r'(\d+)x(\d+)', resolution_str)
    if parsed is None:
        return None
    return int(parsed.group(1)), int(parsed.group(2))
# Define the image generation function with adjustable parameters and a progress bar
@spaces.GPU
def generate(prompt, stage1_guidance_scale, stage2_guidance_scale, num_inference_steps, resolution, negative_prompt):
    """Run the two-stage PixArt pipeline and return the generated images.

    Stage 1 denoises the first 60% of the schedule into latents; stage 2
    resumes from those latents and completes the remaining 40%.
    """
    size = extract_resolution(resolution)
    if size is None:
        # Unparseable resolution string: fall back to the square default.
        size = (1024, 1024)
    width, height = size
    # Fixed seed so both stages draw from one deterministic generator.
    rng = torch.Generator().manual_seed(444)
    # Fraction of the noise schedule handled by stage 1 before handing off.
    handoff = 0.6
    stage1_latents = base_pipeline(
        prompt=prompt,
        negative_prompt=negative_prompt,
        num_inference_steps=num_inference_steps,
        num_images_per_prompt=1,
        generator=rng,
        guidance_scale=stage1_guidance_scale,
        output_type="latent",
        denoising_end=handoff,
        width=width,
        height=height,
    ).images
    stage2_result = stg2_pipeline(
        prompt=prompt,
        negative_prompt=negative_prompt,
        latents=stage1_latents,
        num_inference_steps=num_inference_steps,
        num_images_per_prompt=1,
        generator=rng,
        guidance_scale=stage2_guidance_scale,
        denoising_start=handoff,
    )
    return stage2_result.images
# Example prompts to demonstrate the model's capabilities.
# Each row matches the gr.Interface inputs in order:
# [prompt, stage-1 guidance scale, stage-2 guidance scale,
#  inference steps, resolution string, negative prompt]
example_prompts = [
    [
        "A futuristic cityscape at night under a starry sky",
        3.5,
        4.5,
        25,
        "1152x960",
        "blurry, overexposed"
    ],
    [
        "A serene landscape with a flowing river and autumn trees",
        3.0,
        4.0,
        20,
        "1152x960",
        "crowded, noisy"
    ],
    [
        "An abstract painting of joy and energy in bright colors",
        3.0,
        4.5,
        30,
        "896x1152",
        "dark, dull"
    ],
    [
        "a stunning portrait of a hamster with an eye patch, piloting a miniature cessna on a wooden desk in an office, depth of field, bokeh, sharp, f1.4",
        3.2,
        4.6,
        40,
        "1024x1024",
        "this is an ugly photograph that no one liked"
    ],
    [
        "Check out my cousin larry in his dirty room, he is such a damn mess",
        3.2,
        4.6,
        40,
        "1152x960",
        "the photograph is blurry and unremarkable"
    ]
]
# Create a Gradio interface, 1024x1024,1152x960,896x1152
# Inputs map positionally onto `generate`'s parameters; the Radio choices must
# remain parseable by `extract_resolution`.
iface = gr.Interface(
    fn=generate,
    inputs=[
        gr.Text(label="Enter your prompt"),
        gr.Slider(1, 20, step=0.1, label="Guidance Scale (Stage I)", value=3.4),
        gr.Slider(1, 20, step=0.1, label="Guidance Scale (Stage II)", value=4.2),
        gr.Slider(1, 50, step=1, label="Number of Inference Steps", value=35),
        gr.Radio(["1024x1024", "1152x960", "896x1152"], label="Resolution", value="1024x1024"),
        gr.Text(value="underexposed, blurry, ugly, washed-out", label="Negative Prompt")
    ],
    # Gallery output: `generate` returns a list of images.
    outputs=gr.Gallery(height=1024, min_width=1024, columns=2),
    examples=example_prompts,
    title="PixArt 900M",
    # HTML description rendered on the Space's landing page.
    description=(
        "This is a two-stage mixture-of-experts model implemented in the spirit of NVIDIA's E-Diffi model."
        "<br />The weights were initialised from <strong>terminusresearch/pixart-900m-1024-ft-v0.6</strong> and trained separately on timestep ranges <strong>999-400</strong> and <strong>400-0</strong>."
        "<br />This results in two models where the first stage is responsible for most of the image's composition and colour, and the second stage handles minor-to-fine details."
        "<br />"
        "<br />In comparison to SDXL's refiner, the second stage here handles twice as many timesteps, which allows it to make more use of the text-conditional guidance, improving its capabilities."
        "<br />"
        "<br />Despite being trained with 40% of the schedule, you will discover that using stage 2 stand-alone as a refiner (img2img) will need half the strength - about 20%."
        "<br />When being used in the two-stage pipeline, it should be configured to handle all of its 40% range."
        "<br />"
        "<br />This model is funded and trained by <strong>Terminus Research Group</strong>."
        " If you would like to collaborate or provide compute, please see the organisation page for how to locate us on Discord."
        "<br />"
        "<br />"
        "<ul>"
        "<li>Lead trainer: @pseudoterminalx (bghira@GitHub)</li>"
        "</ul>"
    )
).launch()