Ryukijano committed
Commit 431e45c · verified · 1 parent: c8f2370

Upload 2 files

Files changed (2):
  1. app.py +9 -17
  2. custom_pipeline.py +22 -50
app.py CHANGED
@@ -8,10 +8,7 @@ from diffusers import DiffusionPipeline, AutoencoderTiny
 from diffusers.models.attention_processor import AttnProcessor2_0
 from custom_pipeline import FluxWithCFGPipeline
 
-# Enable TF32 and set Tensor Core precision
 torch.backends.cuda.matmul.allow_tf32 = True
-torch.backends.cudnn.allow_tf32 = True
-torch.set_float32_matmul_precision('high')
 
 # Constants
 MAX_SEED = np.iinfo(np.int32).max
@@ -32,10 +29,6 @@ pipe.set_adapters(["better"], adapter_weights=[1.0])
 pipe.fuse_lora(adapter_name=["better"], lora_scale=1.0)
 pipe.unload_lora_weights()
 
-# Memory optimizations (optional, uncomment if needed)
-# pipe.enable_model_cpu_offload()
-# pipe.enable_sequential_cpu_offload()
-
 torch.cuda.empty_cache()
 
 # Inference function
@@ -47,15 +40,14 @@ def generate_image(prompt, seed=24, width=DEFAULT_WIDTH, height=DEFAULT_HEIGHT,
 
     start_time = time.time()
 
-    with torch.autocast(device_type="cuda", dtype=torch.float16):
-        # Only generate the last image in the sequence
-        img = pipe.generate_images(
-            prompt=prompt,
-            width=width,
-            height=height,
-            num_inference_steps=num_inference_steps,
-            generator=generator
-        )
+    # Only generate the last image in the sequence
+    img = pipe.generate_images(
+        prompt=prompt,
+        width=width,
+        height=height,
+        num_inference_steps=num_inference_steps,
+        generator=generator
+    )
     latency = f"Latency: {(time.time()-start_time):.2f} seconds"
     return img, seed, latency
 
@@ -171,4 +163,4 @@ with gr.Blocks() as demo:
     )
 
 # Launch the app
-demo.launch()
+demo.launch()
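
Net effect in app.py: the torch.autocast wrapper around generation is removed (likely redundant, since the pipeline already computes in float16, as the float16 guidance tensor in custom_pipeline.py suggests), and two of the three float32 precision knobs are dropped while TF32 matmuls stay enabled. The removed knobs are standard PyTorch settings; a minimal sketch of restoring them, assuming an Ampere-or-newer GPU:

    import torch

    torch.backends.cuda.matmul.allow_tf32 = True  # still set in app.py
    torch.backends.cudnn.allow_tf32 = True        # removed: TF32 for cuDNN convolutions
    torch.set_float32_matmul_precision("high")    # removed: global matmul precision hint

One caveat on the retained timing code: time.time() around asynchronous CUDA work can under-report latency unless torch.cuda.synchronize() runs before reading the clock; here the figure is likely accurate only because decoding the final latents to an image copies to the CPU, which synchronizes.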
custom_pipeline.py CHANGED
@@ -130,57 +130,29 @@ class FluxWithCFGPipeline(FluxPipeline):
 
         # Handle guidance
         guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float16).expand(latents.shape[0]) if self.transformer.config.guidance_embeds else None
-
-        # static method that can be jitted
-        @staticmethod
-        @torch.jit.script
-        def _denoising_loop_static(latents, timesteps, pooled_prompt_embeds, prompt_embeds, text_ids, latent_image_ids, guidance, transformer, scheduler):
-            for i, t in enumerate(timesteps):
-                timestep = t.expand(latents.shape[0]).to(latents.dtype)
-
-                noise_pred = transformer(
-                    hidden_states=latents,
-                    timestep=timestep / 1000,
-                    guidance=guidance,
-                    pooled_projections=pooled_prompt_embeds,
-                    encoder_hidden_states=prompt_embeds,
-                    txt_ids=text_ids,
-                    img_ids=latent_image_ids,
-                    return_dict=False,
-                )[0]
-
-                latents = scheduler.step(noise_pred, t, latents, return_dict=False)[0]
-                torch.cuda.empty_cache()
-            return latents
-
-        # Make the core denoising loop a static method
-        self._denoising_loop = torch.cuda.make_graphed_callables(
-            _denoising_loop_static,
-            (
-                latents.clone(),  # Example inputs for warmup
-                timesteps.clone(),
-                pooled_prompt_embeds.clone(),
-                prompt_embeds.clone(),
-                text_ids.clone(),
-                latent_image_ids.clone(),
-                guidance.clone(),
-                self.transformer,
-                self.scheduler
-            )
-        )
 
-        # Call the static method now
-        latents = self._denoising_loop(
-            latents,
-            timesteps,
-            pooled_prompt_embeds,
-            prompt_embeds,
-            text_ids,
-            latent_image_ids,
-            guidance,
-            self.transformer,
-            self.scheduler
-        )
+        # 6. Denoising loop
+        for i, t in enumerate(timesteps):
+            if self.interrupt:
+                continue
+
+            timestep = t.expand(latents.shape[0]).to(latents.dtype)
+
+            noise_pred = self.transformer(
+                hidden_states=latents,
+                timestep=timestep / 1000,
+                guidance=guidance,
+                pooled_projections=pooled_prompt_embeds,
+                encoder_hidden_states=prompt_embeds,
+                txt_ids=text_ids,
+                img_ids=latent_image_ids,
+                joint_attention_kwargs=self.joint_attention_kwargs,
+                return_dict=False,
+            )[0]
+
+            # Yield intermediate result
+            latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+            torch.cuda.empty_cache()
 
         # Final image
         return self._decode_latents_to_image(latents, height, width, output_type)
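
Why the revert in custom_pipeline.py was needed: the deleted block could not run as written. torch.jit.script cannot compile a function that takes nn.Module or scheduler objects as parameters, @staticmethod does nothing useful on a function defined inside a method body, and torch.cuda.make_graphed_callables expects tensor-only sample arguments and cannot capture a loop whose iteration count depends on Python data. If graph capture is still desired later, the conventional pattern from the PyTorch CUDA Graphs documentation is to capture one fixed-shape forward pass and replay it each step while the loop itself stays eager. A minimal, self-contained sketch with a toy module standing in for the Flux transformer (hypothetical shapes; requires a CUDA GPU):

    import torch

    model = torch.nn.Linear(64, 64, device="cuda", dtype=torch.float16).eval()
    static_in = torch.randn(8, 64, device="cuda", dtype=torch.float16)

    # Warm up on a side stream before capture, per the PyTorch docs.
    s = torch.cuda.Stream()
    s.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(s), torch.no_grad():
        for _ in range(3):
            model(static_in)
    torch.cuda.current_stream().wait_stream(s)

    # Capture a single fixed-shape forward pass into a CUDA graph.
    graph = torch.cuda.CUDAGraph()
    with torch.no_grad(), torch.cuda.graph(graph):
        static_out = model(static_in)

    # Per denoising step: refill the static input buffer, then replay.
    for _ in range(4):
        static_in.copy_(torch.randn_like(static_in))
        graph.replay()  # static_out now holds this step's output

This only pays off when tensor shapes stay constant across steps, which does hold here since width, height, and batch size are fixed for the duration of a call. Note also that the torch.cuda.empty_cache() kept inside the eager loop is expensive (releasing cached blocks synchronizes the device) and would defeat graph replay; it reads as a memory-pressure workaround rather than an optimization.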